{
  "version": "1.0.0",
  "generated": "2026-05-12",
  "count": 73,
  "benchmarks": [
    {
      "id": "open-targets",
      "name": "Open Targets Platform",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "target-disease-association",
        "prioritization"
      ],
      "description": "Integrates genetic, genomic, pharmacological evidence to score targets for 20k+ diseases.",
      "size": {
        "targets": 63000,
        "diseases": 28000,
        "evidence": 18000000
      },
      "primary_paper": {
        "title": "The Open Targets Platform: supporting systematic drug-target identification and prioritisation",
        "authors": [
          "Ochoa D",
          "Hercules A",
          "Carmona M",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1093/nar/gkaa1027",
        "citations": 1100
      },
      "official_url": "https://platform.opentargets.org/",
      "github_url": "https://github.com/opentargets",
      "leaderboard_url": "N/A",
      "license": "CC0 / Apache-2.0",
      "first_release": "2016-12",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry gold standard for target prioritization. Quarterly versioned releases.",
      "related_benchmarks": [
        "depmap",
        "disgenet",
        "primekg"
      ],
      "expert_ids": [
        "ian-dunham",
        "ellen-mcdonagh"
      ],
      "group_ids": [
        "opentargets",
        "embl-ebi",
        "gsk",
        "sanger"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 100.0
    },
    {
      "id": "depmap",
      "name": "DepMap (Cancer Dependency Map)",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "essentiality",
        "biomarker"
      ],
      "description": "Genome-scale CRISPR/RNAi essentiality across 1150 cancer cell lines + omics.",
      "size": {
        "cell_lines": 1150,
        "genes": 18000
      },
      "primary_paper": {
        "title": "Defining a Cancer Dependency Map",
        "authors": [
          "Tsherniak A",
          "Vazquez F",
          "Montgomery PG",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1016/j.cell.2017.06.010",
        "citations": 2600
      },
      "official_url": "https://depmap.org/portal/",
      "github_url": "https://github.com/broadinstitute/depmap",
      "leaderboard_url": "https://depmap.org/portal/prediction/",
      "license": "CC-BY 4.0",
      "first_release": "2017-07",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Quarterly release cadence.",
      "related_benchmarks": [
        "open-targets",
        "lincs-l1000"
      ],
      "expert_ids": [
        "aviad-tsherniak",
        "william-hahn",
        "todd-golub"
      ],
      "group_ids": [
        "broad-depmap"
      ],
      "hosted_by": [],
      "composite_score": 100.0
    },
    {
      "id": "tdc-admet",
      "name": "TDC ADMET Group",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "22-task ADMET benchmark suite with scaffold splits \u2014 core TDC leaderboard.",
      "size": {
        "tasks": 22,
        "molecules": 130000
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons: Machine Learning Datasets and Tasks for Drug Discovery and Development",
        "authors": [
          "Huang K",
          "Fu T",
          "Gao W",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/benchmark/admet_group/overview/",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/overview/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Most-adopted ADMET benchmark. 100+ leaderboard submissions.",
      "related_benchmarks": [
        "moleculenet",
        "admet-ai",
        "polaris-admet",
        "molecule-ace"
      ],
      "expert_ids": [
        "kexin-huang",
        "marinka-zitnik",
        "tianfan-fu"
      ],
      "group_ids": [
        "zitnik-lab",
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 100.0
    },
    {
      "id": "sabdab",
      "name": "SAbDab",
      "stages": [
        "hit-id",
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-structure",
        "affinity"
      ],
      "description": "Structural antibody database \u2014 curated PDB antibody structures with annotation.",
      "size": {
        "structures": 9500,
        "complexes": 2600
      },
      "primary_paper": {
        "title": "SAbDab: the structural antibody database",
        "authors": [
          "Dunbar J",
          "Krawczyk K",
          "Leem J",
          "et al."
        ],
        "year": 2014,
        "doi": "10.1093/nar/gkt1043",
        "citations": 630
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/newsabdab/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2013",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Canonical antibody structure resource. Weekly updates.",
      "related_benchmarks": [
        "oas",
        "cov-abdab",
        "iglm-bench"
      ],
      "expert_ids": [
        "charlotte-deane",
        "james-dunbar"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 100.0
    },
    {
      "id": "chembl",
      "name": "ChEMBL",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "bioactivity",
        "data-resource"
      ],
      "description": "Manually curated bioactive molecule DB; backbone for most ML chemistry benchmarks.",
      "size": {
        "compounds": 2400000,
        "activities": 20700000,
        "targets": 15398
      },
      "primary_paper": {
        "title": "The ChEMBL Database in 2023",
        "authors": [
          "Zdrazil B",
          "Felix E",
          "Hunter F",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1093/nar/gkad1004",
        "citations": 800
      },
      "official_url": "https://www.ebi.ac.uk/chembl/",
      "github_url": "https://github.com/chembl",
      "leaderboard_url": "N/A",
      "license": "CC-BY-SA 3.0",
      "first_release": "2009",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Underlies ~80% of public bioactivity ML benchmarks.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet",
        "pubchem-bioassay"
      ],
      "expert_ids": [
        "andrew-leach",
        "barbara-zdrazil"
      ],
      "group_ids": [
        "embl-ebi"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 97.5
    },
    {
      "id": "oas",
      "name": "Observed Antibody Space (OAS)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-sequence"
      ],
      "description": "Repository of >1B antibody BCR sequences from public repertoires \u2014 core for antibody LM pretraining.",
      "size": {
        "sequences": 2400000000,
        "repertoires": 15000
      },
      "primary_paper": {
        "title": "Observed Antibody Space: A diverse database of cleaned, annotated, and translated unpaired and paired antibody sequences",
        "authors": [
          "Olsen TH",
          "Boyles F",
          "Deane CM"
        ],
        "year": 2022,
        "doi": "10.1002/pro.4205",
        "citations": 310
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/oas/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2018",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Underlies AbLang, IgLM, AntiBERTa \u2014 industry-adopted.",
      "related_benchmarks": [
        "sabdab",
        "cov-abdab",
        "iglm-bench"
      ],
      "expert_ids": [
        "charlotte-deane",
        "tobias-olsen"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 97.5
    },
    {
      "id": "proteingym",
      "name": "ProteinGym",
      "stages": [
        "target-id",
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "variant-effect"
      ],
      "description": "217 DMS substitution assays + indel + clinical variants \u2014 de facto standard for VEPs.",
      "size": {
        "dms_assays": 217,
        "mutations": 2700000,
        "clinical_variants": 2525
      },
      "primary_paper": {
        "title": "ProteinGym: Large-Scale Benchmarks for Protein Fitness Prediction and Design",
        "authors": [
          "Notin P",
          "Kollasch A",
          "Ritter D",
          "et al."
        ],
        "year": 2023,
        "doi": "10.48550/arXiv.2305.06259",
        "citations": 320
      },
      "official_url": "https://proteingym.org/",
      "github_url": "https://github.com/OATML-Markslab/ProteinGym",
      "leaderboard_url": "https://proteingym.org/benchmarks",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Field standard. Clinical track enables fair ESM/EVE/AlphaMissense comparison.",
      "related_benchmarks": [
        "flip",
        "cafa-benchmark"
      ],
      "expert_ids": [
        "debora-marks",
        "pascal-notin",
        "yarin-gal"
      ],
      "group_ids": [
        "marks-lab",
        "oatml"
      ],
      "hosted_by": [
        "proteingym"
      ],
      "composite_score": 97.5
    },
    {
      "id": "posebusters",
      "name": "PoseBusters",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "pose-validation"
      ],
      "description": "Physics-aware eval of docking/co-folding poses \u2014 19 checks catching chemically impossible outputs.",
      "size": {
        "complexes": 428,
        "checks_per_pose": 19
      },
      "primary_paper": {
        "title": "PoseBusters: AI-based docking methods fail to generate physically valid poses or generalise to novel sequences",
        "authors": [
          "Buttenschoen M",
          "Morris GM",
          "Deane CM"
        ],
        "year": 2024,
        "doi": "10.1039/D3SC04185A",
        "citations": 360
      },
      "official_url": "https://posebusters.readthedocs.io/",
      "github_url": "https://github.com/maabuu/posebusters",
      "leaderboard_url": "https://github.com/maabuu/posebusters",
      "license": "BSD-3-Clause",
      "first_release": "2023-08",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Exposed major failure modes in AlphaFold-Multimer/DiffDock/RFAA. Default pharma filter.",
      "related_benchmarks": [
        "plinder",
        "pinder",
        "casf-2016"
      ],
      "expert_ids": [
        "charlotte-deane",
        "martin-buttenschoen"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [
        "posebusters-initiative"
      ],
      "composite_score": 97.0
    },
    {
      "id": "plinder",
      "name": "PLINDER",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "docking",
        "structure-based-benchmark"
      ],
      "description": "Leakage-aware protein-ligand interaction dataset with stratified splits.",
      "size": {
        "complexes": 1400000,
        "unique_systems": 460000
      },
      "primary_paper": {
        "title": "PLINDER: The protein-ligand interactions dataset and evaluation resource",
        "authors": [
          "Durairaj J",
          "Adeshina Y",
          "Cao Z",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2409.17475",
        "citations": 55
      },
      "official_url": "https://www.plinder.sh/",
      "github_url": "https://github.com/plinder-org/plinder",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2024-09",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Replaces PDBbind as the modern leakage-controlled docking standard.",
      "related_benchmarks": [
        "pdbbind",
        "pinder",
        "posebusters"
      ],
      "expert_ids": [
        "torsten-schwede",
        "max-jaderberg"
      ],
      "group_ids": [
        "biozentrum-basel",
        "isomorphic-labs",
        "vantai"
      ],
      "hosted_by": [
        "plinder-initiative"
      ],
      "composite_score": 97.0
    },
    {
      "id": "stringdb",
      "name": "STRING",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "ppi",
        "network-inference"
      ],
      "description": "Protein-protein interaction & functional association network across 12k organisms.",
      "size": {
        "proteins": 67000000,
        "associations": 20000000000,
        "organisms": 12535
      },
      "primary_paper": {
        "title": "The STRING database in 2023",
        "authors": [
          "Szklarczyk D",
          "Kirsch R",
          "Koutrouli M",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1093/nar/gkac1000",
        "citations": 2800
      },
      "official_url": "https://string-db.org/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2000",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Workhorse for network-based target ID. Distinguish functional vs physical edges.",
      "related_benchmarks": [
        "open-targets",
        "primekg"
      ],
      "expert_ids": [
        "christian-von-mering"
      ],
      "group_ids": [
        "sib-swiss"
      ],
      "hosted_by": [
        "elixir"
      ],
      "composite_score": 94.9
    },
    {
      "id": "casp15",
      "name": "CASP15",
      "stages": [
        "hit-id",
        "target-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "structure-prediction"
      ],
      "description": "CASP15 blind structure prediction including multimer and ligand-bound categories.",
      "size": {
        "targets": 127,
        "categories": 5
      },
      "primary_paper": {
        "title": "Critical assessment of methods of protein structure prediction (CASP) \u2014 Round XV",
        "authors": [
          "Kryshtafovych A",
          "Schwede T",
          "Topf M",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1002/prot.26617",
        "citations": 140
      },
      "official_url": "https://predictioncenter.org/casp15/",
      "github_url": "N/A",
      "leaderboard_url": "https://predictioncenter.org/casp15/results",
      "license": "Public",
      "first_release": "2022",
      "last_updated": "2023",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Biennial. Introduced ligand prediction category.",
      "related_benchmarks": [
        "casp16",
        "cameo-targets"
      ],
      "expert_ids": [
        "andriy-kryshtafovych",
        "john-moult",
        "torsten-schwede"
      ],
      "group_ids": [
        "prediction-center-ucd"
      ],
      "hosted_by": [
        "casp"
      ],
      "composite_score": 94.9
    },
    {
      "id": "casp16",
      "name": "CASP16",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general",
        "small-molecule"
      ],
      "task_types": [
        "structure-prediction",
        "ligand-pose"
      ],
      "description": "CASP16 (2024) \u2014 expanded multimer, RNA, and ligand prediction.",
      "size": {
        "targets": 140,
        "categories": 6
      },
      "primary_paper": {
        "title": "CASP16 preliminary overview",
        "authors": "CASP16 assessors",
        "year": 2024,
        "doi": "N/A \u2014 overview papers pending",
        "citations": 15
      },
      "official_url": "https://predictioncenter.org/casp16/",
      "github_url": "N/A",
      "leaderboard_url": "https://predictioncenter.org/casp16/results",
      "license": "Public",
      "first_release": "2024",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "First full multimer+ligand+RNA joint eval.",
      "related_benchmarks": [
        "casp15"
      ],
      "expert_ids": [
        "andriy-kryshtafovych",
        "john-moult"
      ],
      "group_ids": [
        "prediction-center-ucd"
      ],
      "hosted_by": [
        "casp"
      ],
      "composite_score": 94.4
    },
    {
      "id": "cameo-targets",
      "name": "CAMEO weekly targets",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general",
        "small-molecule"
      ],
      "task_types": [
        "structure-prediction",
        "ligand-pose"
      ],
      "description": "Continuous weekly blind eval using pre-release PDB structures \u2014 3D, multimer, ligand pocket.",
      "size": {
        "targets_per_year": 1000,
        "categories": 4
      },
      "primary_paper": {
        "title": "CAMEO: continuous evaluation of computational biology methods",
        "authors": [
          "Haas J",
          "Roth S",
          "Arnold K",
          "et al."
        ],
        "year": 2013,
        "doi": "10.1093/database/bat031",
        "citations": 220
      },
      "official_url": "https://www.cameo3d.org/",
      "github_url": "N/A",
      "leaderboard_url": "https://www.cameo3d.org/",
      "license": "CC-BY 4.0",
      "first_release": "2013",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Weekly cadence complements biennial CASP.",
      "related_benchmarks": [
        "casp15",
        "casp16"
      ],
      "expert_ids": [
        "torsten-schwede",
        "jurgen-haas"
      ],
      "group_ids": [
        "biozentrum-basel"
      ],
      "hosted_by": [
        "cameo"
      ],
      "composite_score": 94.4
    },
    {
      "id": "ord-bench",
      "name": "ORD Reaction Benchmark",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "reaction-prediction",
        "yield"
      ],
      "description": "Reaction benchmarks derived from the Open Reaction Database (yield prediction, condition recommendation).",
      "size": {
        "reactions": 2100000,
        "tasks": 4
      },
      "primary_paper": {
        "title": "The Open Reaction Database",
        "authors": [
          "Kearnes SM",
          "Maser MR",
          "Wleklinski M",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1021/jacs.1c09820",
        "citations": 210
      },
      "official_url": "https://open-reaction-database.org/",
      "github_url": "https://github.com/open-reaction-database",
      "leaderboard_url": "N/A",
      "license": "CC-BY-SA 4.0",
      "first_release": "2021-07",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Modern open reaction corpus; industry-scale.",
      "related_benchmarks": [
        "uspto-retrosyn"
      ],
      "expert_ids": [
        "steven-kearnes",
        "abigail-doyle",
        "connor-coley"
      ],
      "group_ids": [
        "google-research",
        "princeton-chemistry",
        "coley-lab"
      ],
      "hosted_by": [
        "open-reaction-database"
      ],
      "composite_score": 93.9
    },
    {
      "id": "openproblems-perturbation",
      "name": "Open Problems: Perturbation Prediction",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "NeurIPS competition & continuing benchmark for single-cell perturbation response prediction under distribution shift.",
      "size": {
        "cells": 240000,
        "perturbations": 144
      },
      "primary_paper": {
        "title": "Predicting Cellular Responses to Novel Drug Perturbations at a Single-Cell Resolution",
        "authors": [
          "Hetzel L",
          "Boehm S",
          "Kilbertus N",
          "et al."
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2204.13545",
        "citations": 180
      },
      "official_url": "https://openproblems.bio/",
      "github_url": "https://github.com/openproblems-bio/openproblems",
      "leaderboard_url": "https://openproblems.bio/results",
      "license": "MIT",
      "first_release": "2021-06",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Best-in-class rigor (Viash workflow, hidden test, NeurIPS track).",
      "related_benchmarks": [
        "cz-virtual-cell-challenge",
        "scperturb",
        "lincs-l1000"
      ],
      "expert_ids": [
        "fabian-theis",
        "daniel-burkhardt",
        "malte-luecken"
      ],
      "group_ids": [
        "openproblems",
        "helmholtz-munich",
        "czi-science"
      ],
      "hosted_by": [
        "openproblems",
        "czi-virtual-cell"
      ],
      "composite_score": 91.9
    },
    {
      "id": "primekg",
      "name": "PrimeKG",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "knowledge-graph",
        "drug-repurposing"
      ],
      "description": "Precision medicine knowledge graph integrating 20 sources for 17k diseases.",
      "size": {
        "nodes": 129375,
        "edges": 8100498,
        "diseases": 17080
      },
      "primary_paper": {
        "title": "Building a knowledge graph to enable precision medicine",
        "authors": [
          "Chandak P",
          "Huang K",
          "Zitnik M"
        ],
        "year": 2023,
        "doi": "10.1038/s41597-023-01960-3",
        "citations": 320
      },
      "official_url": "https://zitniklab.hms.harvard.edu/projects/PrimeKG/",
      "github_url": "https://github.com/mims-harvard/PrimeKG",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-03",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Modern, well-engineered KG; strong for GNN drug repurposing.",
      "related_benchmarks": [
        "open-targets",
        "disgenet"
      ],
      "expert_ids": [
        "marinka-zitnik",
        "payal-chandak",
        "kexin-huang"
      ],
      "group_ids": [
        "zitnik-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 91.9
    },
    {
      "id": "faers-bench",
      "name": "FAERS (raw)",
      "stages": [
        "post-market-rwe"
      ],
      "modalities": [
        "small-molecule",
        "biologic-mab"
      ],
      "task_types": [
        "pharmacovigilance"
      ],
      "description": "FDA Adverse Event Reporting System \u2014 19M+ reports used as substrate for signal-detection ML.",
      "size": {
        "reports": 19000000
      },
      "primary_paper": {
        "title": "An assessment of the U.S. FDA Adverse Event Reporting System (FAERS) and the impact of quality reporting",
        "authors": [
          "Sakaeda T",
          "Tamon A",
          "Kadoyama K",
          "Okuno Y"
        ],
        "year": 2013,
        "doi": "10.3390/ijerph100300796",
        "citations": 780
      },
      "official_url": "https://open.fda.gov/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public domain",
      "first_release": "1969",
      "last_updated": "2025-Q2",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Known under-/over-reporting biases.",
      "related_benchmarks": [
        "offsides-twosides",
        "sider"
      ],
      "expert_ids": [
        "FDA CDER"
      ],
      "group_ids": [
        "fda-cder"
      ],
      "hosted_by": [
        "faers"
      ],
      "composite_score": 91.1
    },
    {
      "id": "longevity-bench-insilico",
      "name": "Longevity Benchmark (Insilico)",
      "stages": [
        "disease-modeling",
        "target-id",
        "post-market-rwe"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "aging-prediction",
        "survival",
        "biomarker"
      ],
      "description": "19 aging/longevity benchmarks on ScienceAIBench/InsilicoBench/DDB \u2014 NHANES mortality, TCGA survival, methylation age, GTEx, Olink proteomic, longevity synergy.",
      "size": {
        "benchmarks": 19,
        "datasets": [
          "NHANES",
          "TCGA",
          "Methylation",
          "GTEx",
          "Olink"
        ]
      },
      "primary_paper": {
        "title": "Longevity Benchmark methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 12
      },
      "official_url": "https://insilicobench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://insilicobench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Unique, broad longevity/aging benchmark slice \u2014 nothing else in the field covers aging comparably. Leaderboard features frontier LLMs.",
      "related_benchmarks": [
        "depmap",
        "mimic-benchmark"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper",
        "alex-zhebrak"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 90.6
    },
    {
      "id": "lincs-l1000",
      "name": "LINCS L1000 / CMap",
      "stages": [
        "virtual-cell",
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "small-molecule",
        "cross-modality"
      ],
      "task_types": [
        "signature-match",
        "mechanism-of-action"
      ],
      "description": "1.3M transcriptional profiles across 978 landmark genes after genetic or chemical perturbations.",
      "size": {
        "profiles": 1319138,
        "compounds": 33000,
        "cell_lines": 82
      },
      "primary_paper": {
        "title": "A Next Generation Connectivity Map: L1000 Platform and the First 1,000,000 Profiles",
        "authors": [
          "Subramanian A",
          "Narayan R",
          "Corsello SM",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1016/j.cell.2017.10.049",
        "citations": 3700
      },
      "official_url": "https://clue.io/",
      "github_url": "https://github.com/cmap",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2017-11",
      "last_updated": "2024-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Foundational pharma resource for MoA work. Batch effects require careful handling.",
      "related_benchmarks": [
        "scperturb",
        "depmap"
      ],
      "expert_ids": [
        "aravind-subramanian",
        "todd-golub"
      ],
      "group_ids": [
        "broad-cmap"
      ],
      "hosted_by": [],
      "composite_score": 89.9
    },
    {
      "id": "cansar",
      "name": "canSAR",
      "stages": [
        "target-id",
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "druggability",
        "target-annotation"
      ],
      "description": "Cancer translational knowledge base integrating pharmacology, bioactivity, structure, druggability.",
      "size": {
        "proteins": 25000,
        "compounds": 11000000,
        "bioactivity_datapoints": 106000000
      },
      "primary_paper": {
        "title": "canSAR: update to the cancer translational research and drug discovery knowledgebase",
        "authors": [
          "Mitsopoulos C",
          "Di Micco P",
          "et al."
        ],
        "year": 2021,
        "doi": "10.1093/nar/gkaa1059",
        "citations": 210
      },
      "official_url": "https://cansar.ai/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic free / commercial tier",
      "first_release": "2011",
      "last_updated": "2024-09",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Deep oncology focus; widely-used druggability predictor.",
      "related_benchmarks": [
        "open-targets",
        "depmap",
        "chembl"
      ],
      "expert_ids": [
        "bissan-al-lazikani"
      ],
      "group_ids": [
        "icr-london"
      ],
      "hosted_by": [],
      "composite_score": 89.4
    },
    {
      "id": "mimic-benchmark",
      "name": "MIMIC-IV Benchmark Tasks",
      "stages": [
        "phase-3",
        "clinical-development",
        "post-market-rwe"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "clinical-outcome"
      ],
      "description": "Standardized ICU benchmarks on MIMIC-IV \u2014 mortality, LOS, sepsis, AKI, drug dosing.",
      "size": {
        "patients": 299712,
        "tasks": 14
      },
      "primary_paper": {
        "title": "MIMIC-IV, a freely accessible electronic health record dataset",
        "authors": [
          "Johnson AEW",
          "Bulgarelli L",
          "Shen L",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1038/s41597-022-01899-x",
        "citations": 620
      },
      "official_url": "https://physionet.org/content/mimiciv/",
      "github_url": "https://github.com/MIT-LCP/mimic-code",
      "leaderboard_url": "N/A",
      "license": "PhysioNet Credentialed",
      "first_release": "2020",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Canonical clinical ML benchmark. Credentialed access limits casual use.",
      "related_benchmarks": [
        "ctod"
      ],
      "expert_ids": [
        "alistair-johnson",
        "leo-anthony-celi",
        "roger-mark"
      ],
      "group_ids": [
        "mit-lcp"
      ],
      "hosted_by": [
        "mimic"
      ],
      "composite_score": 89.4
    },
    {
      "id": "scperturb",
      "name": "scPerturb",
      "stages": [
        "virtual-cell",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Harmonized single-cell perturbation datasets (genetic + chemical) with standardized metadata.",
      "size": {
        "cells": 1700000,
        "datasets": 44
      },
      "primary_paper": {
        "title": "scPerturb: Harmonized single-cell perturbation data",
        "authors": [
          "Peidli S",
          "Green TD",
          "Shen C",
          "Theis FJ",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1038/s41592-023-02144-y",
        "citations": 280
      },
      "official_url": "http://projects.sanderlab.org/scperturb/",
      "github_url": "https://github.com/sanderlab/scPerturb",
      "leaderboard_url": "N/A \u2014 dataset resource",
      "license": "MIT",
      "first_release": "2023-05",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Canonical harmonized resource. Strong Perturb-seq coverage; weaker for chemical perturbations.",
      "related_benchmarks": [
        "cz-virtual-cell-challenge",
        "openproblems-perturbation",
        "lincs-l1000"
      ],
      "expert_ids": [
        "chris-sander",
        "fabian-theis"
      ],
      "group_ids": [
        "sander-lab",
        "helmholtz-munich"
      ],
      "hosted_by": [],
      "composite_score": 88.9
    },
    {
      "id": "pinder",
      "name": "PINDER",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-protein-docking"
      ],
      "description": "Protein-protein docking benchmark with rigorous split design.",
      "size": {
        "dimers": 2319564,
        "systems": 267498
      },
      "primary_paper": {
        "title": "PINDER: The Protein Interaction Dataset and Evaluation Resource",
        "authors": [
          "Kovtun D",
          "Akdel M",
          "Goncearenco A",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1101/2024.07.17.603980",
        "citations": 45
      },
      "official_url": "https://www.pinder.sh/",
      "github_url": "https://github.com/pinder-org/pinder",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2024-07",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 3,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Expected PPI docking standard.",
      "related_benchmarks": [
        "plinder",
        "capri-benchmark"
      ],
      "expert_ids": [
        "torsten-schwede"
      ],
      "group_ids": [
        "vantai",
        "biozentrum-basel"
      ],
      "hosted_by": [
        "plinder-initiative"
      ],
      "composite_score": 88.9
    },
    {
      "id": "pmo",
      "name": "Practical Molecular Optimization (PMO)",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation",
        "oracle"
      ],
      "description": "Budget-aware benchmark for goal-directed molecule optimization with 23 oracle functions.",
      "size": {
        "oracles": 23,
        "budget": 10000
      },
      "primary_paper": {
        "title": "Sample Efficiency Matters: A Benchmark for Practical Molecular Optimization",
        "authors": [
          "Gao W",
          "Fu T",
          "Sun J",
          "Coley CW"
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2206.12411",
        "citations": 260
      },
      "official_url": "https://github.com/wenhao-gao/mol_opt",
      "github_url": "https://github.com/wenhao-gao/mol_opt",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-06",
      "last_updated": "2024-10",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Sample-efficiency focus exposed shortcomings of reward-maxing methods.",
      "related_benchmarks": [
        "guacamol",
        "moses",
        "tdc-admet"
      ],
      "expert_ids": [
        "wenhao-gao",
        "tianfan-fu",
        "connor-coley"
      ],
      "group_ids": [
        "mit-csail",
        "coley-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 88.9
    },
    {
      "id": "cov-abdab",
      "name": "CoV-AbDab",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-antigen-binding"
      ],
      "description": "Coronavirus antibody database \u2014 binding & neutralization annotations for SARS-CoV-2/MERS/etc.",
      "size": {
        "antibodies": 12000
      },
      "primary_paper": {
        "title": "CoV-AbDab: the coronavirus antibody database",
        "authors": [
          "Raybould MIJ",
          "Kovaltsuk A",
          "Marks C",
          "Deane CM"
        ],
        "year": 2021,
        "doi": "10.1093/bioinformatics/btaa739",
        "citations": 220
      },
      "official_url": "https://opig.stats.ox.ac.uk/webapps/covabdab/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2020",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Narrow modality but critical for pandemic-preparedness ML.",
      "related_benchmarks": [
        "sabdab",
        "oas"
      ],
      "expert_ids": [
        "matthew-raybould",
        "charlotte-deane"
      ],
      "group_ids": [
        "oxford-opig"
      ],
      "hosted_by": [],
      "composite_score": 88.9
    },
    {
      "id": "pubchem-bioassay",
      "name": "PubChem BioAssay",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "bioactivity"
      ],
      "description": "NIH PubChem's screening assay repository.",
      "size": {
        "assays": 1700000,
        "compounds": 114000000
      },
      "primary_paper": {
        "title": "PubChem 2023 update",
        "authors": [
          "Kim S",
          "Chen J",
          "Cheng T",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1093/nar/gkac956",
        "citations": 1200
      },
      "official_url": "https://pubchem.ncbi.nlm.nih.gov/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public domain",
      "first_release": "2004",
      "last_updated": "2025-05",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Broadest HTS repository; quality heterogeneous.",
      "related_benchmarks": [
        "chembl",
        "moleculenet"
      ],
      "expert_ids": [
        "sunghwan-kim"
      ],
      "group_ids": [
        "nih-ncbi"
      ],
      "hosted_by": [],
      "composite_score": 88.6
    },
    {
      "id": "polaris-admet",
      "name": "Polaris ADMET",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "Industry-contributed ADMET benchmarks on Polaris Hub (Novartis/AstraZeneca/Polaris SC curated endpoints).",
      "size": {
        "endpoints": 12,
        "molecules": 50000
      },
      "primary_paper": {
        "title": "Polaris: method comparison in drug discovery",
        "authors": [
          "Wognum C",
          "Noutahi E",
          "Hsu J",
          "et al."
        ],
        "year": 2024,
        "doi": "N/A \u2014 working paper",
        "citations": 30
      },
      "official_url": "https://polarishub.io/benchmarks",
      "github_url": "https://github.com/polaris-hub/polaris",
      "leaderboard_url": "https://polarishub.io/benchmarks",
      "license": "Polaris Community",
      "first_release": "2023-10",
      "last_updated": "2025-04",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry splits enforce blinded eval; highest industry relevance among ADMET benchmarks.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "cas-wognum",
        "emmanuel-noutahi",
        "jonathan-hsu"
      ],
      "group_ids": [
        "valence-labs",
        "recursion",
        "novartis-nibr",
        "astrazeneca"
      ],
      "hosted_by": [
        "polaris"
      ],
      "composite_score": 88.4
    },
    {
      "id": "cz-virtual-cell-challenge",
      "name": "CZ Virtual Cell Challenge",
      "stages": [
        "virtual-cell",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Open challenge to predict transcriptomic responses to genetic & chemical perturbations across cell types, hosted by CZ Biohub.",
      "size": {
        "cells": 2000000,
        "perturbations": 300
      },
      "primary_paper": {
        "title": "CZ Virtual Cell Challenge (launch)",
        "authors": [
          "CZI Science Team"
        ],
        "year": 2024,
        "doi": "N/A \u2014 consortium launch",
        "citations": 40
      },
      "official_url": "https://virtualcellchallenge.org/",
      "github_url": "https://github.com/czbiohub-sf",
      "leaderboard_url": "https://virtualcellchallenge.org/leaderboard",
      "license": "CC-BY",
      "first_release": "2024-11",
      "last_updated": "2025-09",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Gold standard-in-the-making for foundation-model era perturbation prediction. Hidden test \u2192 strong against leakage.",
      "related_benchmarks": [
        "scperturb",
        "openproblems-perturbation",
        "perturbbench"
      ],
      "expert_ids": [
        "stephen-quake",
        "ambrose-carr"
      ],
      "group_ids": [
        "cz-biohub",
        "czi-science"
      ],
      "hosted_by": [
        "czi-virtual-cell"
      ],
      "composite_score": 88.1
    },
    {
      "id": "pli-gpcr-suite",
      "name": "ISM Benchmarks: GPCRs (Insilico)",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "binding-affinity"
      ],
      "description": "87-benchmark GPCR affinity suite on ScienceAIBench / InsilicoBench / DDB \u2014 kinome-scale coverage of class A/B/C GPCRs.",
      "size": {
        "benchmarks": 87,
        "gpcr_targets": 87
      },
      "primary_paper": {
        "title": "GPCR affinity benchmark methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 8
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Largest open GPCR affinity benchmark. Leaderboards test external frontier LLMs \u2014 not self-referential.",
      "related_benchmarks": [
        "pdbbind",
        "chembl"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper",
        "alex-zhebrak"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 87.6
    },
    {
      "id": "capri-benchmark",
      "name": "CAPRI Rounds",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-protein-docking"
      ],
      "description": "56 rounds of blind protein-protein (and peptide, ligand) complex prediction.",
      "size": {
        "rounds": 56,
        "targets": 300
      },
      "primary_paper": {
        "title": "Modeling protein-protein and protein-peptide complexes: CAPRI 6th edition",
        "authors": [
          "Lensink MF",
          "Nadzirin N",
          "Velankar S",
          "Wodak SJ"
        ],
        "year": 2020,
        "doi": "10.1002/prot.25870",
        "citations": 200
      },
      "official_url": "https://www.ebi.ac.uk/pdbe/complex-pred/capri/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2001",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Oldest PPI prediction benchmark.",
      "related_benchmarks": [
        "pinder",
        "casp16"
      ],
      "expert_ids": [
        "marc-lensink",
        "shoshana-wodak"
      ],
      "group_ids": [
        "ebi",
        "ibm-brussels"
      ],
      "hosted_by": [
        "capri"
      ],
      "composite_score": 86.3
    },
    {
      "id": "toxcast",
      "name": "ToxCast",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity"
      ],
      "description": "EPA's in vitro toxicity screening dataset \u2014 ~700 assays \u00d7 ~9k chemicals.",
      "size": {
        "assays": 700,
        "compounds": 9000
      },
      "primary_paper": {
        "title": "The ToxCast Program for Prioritizing Toxicity Testing of Environmental Chemicals",
        "authors": [
          "Dix DJ",
          "Houck KA",
          "Martin MT",
          "et al."
        ],
        "year": 2007,
        "doi": "10.1093/toxsci/kfm297",
        "citations": 1900
      },
      "official_url": "https://www.epa.gov/comptox-tools/exploring-toxcast-data",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2007",
      "last_updated": "2024-06",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Regulatory-grade broad tox dataset.",
      "related_benchmarks": [
        "tox21"
      ],
      "expert_ids": [
        "richard-judson"
      ],
      "group_ids": [
        "epa-ccte"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 85.6
    },
    {
      "id": "targetbench-insilico",
      "name": "TargetBench (Insilico)",
      "stages": [
        "target-id",
        "disease-modeling"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "target-prioritization"
      ],
      "description": "10 disease-area target identification benchmarks on ScienceAIBench/DDB (cancer, cardiovascular, endocrine/metabolic, fibrotic, inflammation/immunology, neuro, etc.).",
      "size": {
        "benchmarks": 10,
        "disease_areas": 10
      },
      "primary_paper": {
        "title": "TargetBench methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 5
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-03",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Disease-organized target ID benchmark \u2014 unique axis. Frontier LLM leaderboard.",
      "related_benchmarks": [
        "open-targets",
        "depmap"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 84.6
    },
    {
      "id": "ism-admet",
      "name": "ISM Benchmarks: ADMET (Insilico)",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "admet-regression",
        "admet-classification"
      ],
      "description": "28-endpoint ADMET benchmark suite on ScienceAIBench/InsilicoBench/DDB.",
      "size": {
        "benchmarks": 28
      },
      "primary_paper": {
        "title": "ISM ADMET benchmark methodology",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 7
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-02",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Broader endpoint coverage than TDC ADMET. Side-by-side with TDC mirror on DDB.",
      "related_benchmarks": [
        "tdc-admet",
        "admet-ai"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 84.6
    },
    {
      "id": "cafa5",
      "name": "CAFA5",
      "stages": [
        "target-id"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "function-annotation"
      ],
      "description": "CAFA 5th edition \u2014 blind GO annotation eval (142k targets; Kaggle 2023, 1625 teams).",
      "size": {
        "targets": 142000,
        "go_terms": 43000
      },
      "primary_paper": {
        "title": "The CAFA challenge reports improved protein function prediction and new functional annotations for hundreds of genes through experimental screens",
        "authors": [
          "Zhou N",
          "Jiang Y",
          "Bergquist TR",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1186/s13059-019-1835-8",
        "citations": 380
      },
      "official_url": "https://biofunctionprediction.org/cafa/",
      "github_url": "N/A",
      "leaderboard_url": "https://biofunctionprediction.org/cafa/results",
      "license": "Public",
      "first_release": "2022",
      "last_updated": "2023-12",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "CAFA5 broke attendance records.",
      "related_benchmarks": [
        "proteingym"
      ],
      "expert_ids": [
        "predrag-radivojac",
        "iddo-friedberg"
      ],
      "group_ids": [
        "cafa-consortium"
      ],
      "hosted_by": [
        "cafa"
      ],
      "composite_score": 84.3
    },
    {
      "id": "molecule-ace",
      "name": "MoleculeACE",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "activity-cliff"
      ],
      "description": "Benchmark testing model robustness on activity cliffs across 30 ChEMBL targets.",
      "size": {
        "targets": 30,
        "molecules": 48000
      },
      "primary_paper": {
        "title": "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs",
        "authors": [
          "van Tilborg D",
          "Alenicheva A",
          "Grisoni F"
        ],
        "year": 2022,
        "doi": "10.1021/acs.jcim.2c01073",
        "citations": 180
      },
      "official_url": "https://github.com/molML/MoleculeACE",
      "github_url": "https://github.com/molML/MoleculeACE",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-11",
      "last_updated": "2024-05",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Critical stress-test for generalization; exposed GNN weaknesses.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "francesca-grisoni",
        "derek-van-tilborg"
      ],
      "group_ids": [
        "tue-eindhoven"
      ],
      "hosted_by": [],
      "composite_score": 83.3
    },
    {
      "id": "matbench",
      "name": "MatBench",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "materials-property"
      ],
      "description": "Materials property prediction benchmark suite (used for some chemistry-adjacent ML).",
      "size": {
        "tasks": 13,
        "samples": 132000
      },
      "primary_paper": {
        "title": "Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm",
        "authors": [
          "Dunn A",
          "Wang Q",
          "Ganose A",
          "Dopp D",
          "Jain A"
        ],
        "year": 2020,
        "doi": "10.1038/s41524-020-00406-3",
        "citations": 380
      },
      "official_url": "https://matbench.materialsproject.org/",
      "github_url": "https://github.com/materialsproject/matbench",
      "leaderboard_url": "https://matbench.materialsproject.org/Leaderboards%20Per-Task/",
      "license": "MIT",
      "first_release": "2020",
      "last_updated": "2025-02",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Materials-science benchmark; relevant for formulation / co-crystal work.",
      "related_benchmarks": [],
      "expert_ids": [
        "anubhav-jain",
        "alex-dunn"
      ],
      "group_ids": [
        "materials-project",
        "lbl"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 83.3
    },
    {
      "id": "offsides-twosides",
      "name": "OffSides / TWOSIDES",
      "stages": [
        "post-market-rwe"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "adverse-event",
        "ddi"
      ],
      "description": "Statistically corrected single- and pair-wise drug adverse events derived from FAERS.",
      "size": {
        "offsides_signals": 438801,
        "twosides_combos": 870000
      },
      "primary_paper": {
        "title": "Data-driven prediction of drug effects and interactions",
        "authors": [
          "Tatonetti NP",
          "Ye PP",
          "Daneshjou R",
          "Altman RB"
        ],
        "year": 2012,
        "doi": "10.1126/scitranslmed.3003377",
        "citations": 1100
      },
      "official_url": "http://tatonettilab.org/offsides/",
      "github_url": "https://github.com/tatonetti-lab/nsides-release",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2012",
      "last_updated": "2023-09",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Key benchmark for DDI + adverse event ML.",
      "related_benchmarks": [
        "sider",
        "faers-bench"
      ],
      "expert_ids": [
        "nick-tatonetti",
        "russ-altman"
      ],
      "group_ids": [
        "tatonetti-lab"
      ],
      "hosted_by": [
        "faers",
        "tdc"
      ],
      "composite_score": 83.0
    },
    {
      "id": "clinbench-quarterly",
      "name": "ClinBench Quarterly (Insilico)",
      "stages": [
        "phase-2",
        "phase-3",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Quarterly-refreshed clinical-trial outcome benchmark on ScienceAIBench / InsilicoBench / DDB (25 tasks).",
      "size": {
        "tasks": 25,
        "refresh_cadence_months": 3
      },
      "primary_paper": {
        "title": "ClinBench methodology note",
        "authors": "Insilico AI Team",
        "year": 2025,
        "doi": "N/A \u2014 portal",
        "citations": 10
      },
      "official_url": "https://scienceaibench.insilico.com/",
      "github_url": "N/A",
      "leaderboard_url": "https://scienceaibench.insilico.com/",
      "license": "CC-BY",
      "first_release": "2025-01",
      "last_updated": "2026-04",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Benchmark refresh cadence beats all academic trial outcome benchmarks. Leaderboards test frontier LLMs against quarterly-updated splits.",
      "related_benchmarks": [
        "hint-trialbench",
        "top-benchmark"
      ],
      "expert_ids": [
        "alex-zhavoronkov",
        "alex-aliper"
      ],
      "group_ids": [
        "insilico-medicine"
      ],
      "hosted_by": [
        "insilico-scienceaibench",
        "insilico-insilicobench",
        "insilico-ddb"
      ],
      "composite_score": 81.5
    },
    {
      "id": "dockstring",
      "name": "DOCKSTRING",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "docking"
      ],
      "description": "260k ligands docked to 58 targets with AutoDock Vina; reproducible docking benchmark.",
      "size": {
        "ligands": 260155,
        "targets": 58
      },
      "primary_paper": {
        "title": "DOCKSTRING: easy molecular docking yields better benchmarks for ligand design",
        "authors": [
          "Garc\u00eda-Orteg\u00f3n M",
          "Simm GNC",
          "Tripp AJ",
          "et al."
        ],
        "year": 2022,
        "doi": "10.1021/acs.jcim.1c01334",
        "citations": 240
      },
      "official_url": "https://dockstring.github.io/",
      "github_url": "https://github.com/dockstring/dockstring",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022-02",
      "last_updated": "2024-07",
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Vina scores are a proxy; not a replacement for wet assays.",
      "related_benchmarks": [
        "lit-pcba",
        "pdbbind",
        "casf-2016"
      ],
      "expert_ids": [
        "jose-miguel-hernandez-lobato"
      ],
      "group_ids": [
        "cambridge-ml"
      ],
      "hosted_by": [],
      "composite_score": 81.3
    },
    {
      "id": "disgenet",
      "name": "DisGeNET",
      "stages": [
        "disease-modeling",
        "target-id"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "gene-disease-association"
      ],
      "description": "Large collection of gene-disease associations (text-mining + curation + repositories).",
      "size": {
        "associations": 1134942,
        "genes": 21671,
        "diseases": 30170
      },
      "primary_paper": {
        "title": "The DisGeNET knowledge platform for disease genomics: 2019 update",
        "authors": [
          "Pi\u00f1ero J",
          "Ram\u00edrez-Anguita JM",
          "Sa\u00fcch-Pitarch J",
          "et al."
        ],
        "year": 2020,
        "doi": "10.1093/nar/gkz1021",
        "citations": 1400
      },
      "official_url": "https://www.disgenet.com/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY-NC / commercial tier",
      "first_release": "2010",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 5,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [
        "license-gated-commercial"
      ],
      "notes": "Commercial license required for industry. Text-mining noise limits quality.",
      "related_benchmarks": [
        "open-targets",
        "primekg"
      ],
      "expert_ids": [
        "laura-furlong"
      ],
      "group_ids": [
        "medbioinformatics"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 81.0
    },
    {
      "id": "lit-pcba",
      "name": "LIT-PCBA",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Literature-curated dose-response PubChem BioAssay set; 15 targets with unbiased actives/inactives.",
      "size": {
        "targets": 15,
        "actives": 7844,
        "inactives": 2533936
      },
      "primary_paper": {
        "title": "LIT-PCBA: An Unbiased Data Set for Machine Learning and Virtual Screening",
        "authors": [
          "Tran-Nguyen VK",
          "Jacquemard C",
          "Rognan D"
        ],
        "year": 2020,
        "doi": "10.1021/acs.jcim.0c00155",
        "citations": 310
      },
      "official_url": "https://drugdesign.unistra.fr/LIT-PCBA/",
      "github_url": "https://github.com/ViktorTran-Nguyen/LIT-PCBA",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2020-03",
      "last_updated": "2023-06",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Much fairer than DUD-E; small target count limits coverage.",
      "related_benchmarks": [
        "dude",
        "dekois",
        "dockstring"
      ],
      "expert_ids": [
        "didier-rognan"
      ],
      "group_ids": [
        "strasbourg-chemo"
      ],
      "hosted_by": [],
      "composite_score": 80.8
    },
    {
      "id": "flip",
      "name": "FLIP",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "fitness-prediction"
      ],
      "description": "Fitness landscape inference benchmarks with realistic train/test splits (AAV, GB1, Meltome, SCL, Bind).",
      "size": {
        "landscapes": 5,
        "splits": 15
      },
      "primary_paper": {
        "title": "FLIP: Benchmark tasks in fitness landscape inference for proteins",
        "authors": [
          "Dallago C",
          "Mou J",
          "Johnston KE",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2112.06661",
        "citations": 120
      },
      "official_url": "https://benchmark.protein.properties/",
      "github_url": "https://github.com/J-SNACKKB/FLIP",
      "leaderboard_url": "N/A",
      "license": "CC-BY 4.0",
      "first_release": "2021-12",
      "last_updated": "2024-05",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Complements ProteinGym (smaller but carefully designed splits).",
      "related_benchmarks": [
        "proteingym"
      ],
      "expert_ids": [
        "burkhard-rost",
        "mohammed-alquraishi",
        "christian-dallago"
      ],
      "group_ids": [
        "rostlab-tum",
        "alquraishi-lab"
      ],
      "hosted_by": [
        "flip"
      ],
      "composite_score": 80.8
    },
    {
      "id": "cptac-proteogenomic",
      "name": "CPTAC Proteogenomic Benchmarks",
      "stages": [
        "disease-modeling",
        "target-id",
        "phase-2"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "biomarker-discovery"
      ],
      "description": "Proteogenomic benchmarks across 10 tumor types \u2014 drives DREAM proteogenomic challenges.",
      "size": {
        "samples": 1600,
        "tumor_types": 10,
        "omics_layers": 6
      },
      "primary_paper": {
        "title": "Proteogenomic characterization of human colon and rectal cancer",
        "authors": [
          "Zhang B",
          "Wang J",
          "Wang X",
          "et al."
        ],
        "year": 2014,
        "doi": "10.1038/nature13438",
        "citations": 1200
      },
      "official_url": "https://proteomics.cancer.gov/programs/cptac",
      "github_url": "https://github.com/PayneLab/cptac",
      "leaderboard_url": "N/A",
      "license": "dbGaP / public",
      "first_release": "2011",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Deep integrative oncology data.",
      "related_benchmarks": [
        "depmap"
      ],
      "expert_ids": [
        "henry-rodriguez",
        "amanda-paulovich",
        "bing-zhang"
      ],
      "group_ids": [
        "nci-cptac"
      ],
      "hosted_by": [
        "cptac"
      ],
      "composite_score": 80.8
    },
    {
      "id": "guacamol",
      "name": "GuacaMol",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation"
      ],
      "description": "Goal-directed + distribution-learning benchmarks for molecular generative models.",
      "size": {
        "tasks": 20,
        "train_set": 1600000
      },
      "primary_paper": {
        "title": "GuacaMol: Benchmarking Models for de Novo Molecular Design",
        "authors": [
          "Brown N",
          "Fiscato M",
          "Segler MHS",
          "Vaucher AC"
        ],
        "year": 2019,
        "doi": "10.1021/acs.jcim.8b00839",
        "citations": 820
      },
      "official_url": "https://www.benevolent.com/guacamol",
      "github_url": "https://github.com/BenevolentAI/guacamol",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2019-03",
      "last_updated": "2022-07",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "First-generation generative benchmark; largely superseded by PMO for goal-directed optimization.",
      "related_benchmarks": [
        "moses",
        "pmo"
      ],
      "expert_ids": [
        "marwin-segler",
        "nathan-brown"
      ],
      "group_ids": [
        "benevolent-ai"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 80.5
    },
    {
      "id": "pksim",
      "name": "Open Systems Pharmacology / PK-Sim",
      "stages": [
        "phase-1",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule",
        "biologic-mab"
      ],
      "task_types": [
        "pbpk-validation"
      ],
      "description": "OSP Suite \u2014 open PBPK/QSP models and validation sets.",
      "size": {
        "models": 100,
        "validated_compounds": 50
      },
      "primary_paper": {
        "title": "The Open Systems Pharmacology Suite: a new era in PBPK modeling",
        "authors": [
          "Lippert J",
          "Burghaus R",
          "Edginton A",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1002/psp4.12386",
        "citations": 180
      },
      "official_url": "https://www.open-systems-pharmacology.org/",
      "github_url": "https://github.com/Open-Systems-Pharmacology",
      "leaderboard_url": "N/A",
      "license": "GPL-2.0",
      "first_release": "2017",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Open alternative to Simcyp.",
      "related_benchmarks": [
        "simcyp-validation"
      ],
      "expert_ids": [
        "andrea-edginton",
        "joerg-lippert"
      ],
      "group_ids": [
        "osp-consortium"
      ],
      "hosted_by": [],
      "composite_score": 80.3
    },
    {
      "id": "admet-ai",
      "name": "ADMET-AI",
      "stages": [
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification"
      ],
      "description": "Graph-based ADMET prediction + benchmark on 41 endpoints leveraging ChEMBL + TDC.",
      "size": {
        "endpoints": 41,
        "molecules": 90000
      },
      "primary_paper": {
        "title": "ADMET-AI: a machine learning ADMET platform for evaluation of large-scale chemical libraries",
        "authors": [
          "Swanson K",
          "Walther P",
          "Leitz J",
          "et al."
        ],
        "year": 2024,
        "doi": "10.1093/bioinformatics/btae416",
        "citations": 85
      },
      "official_url": "https://admet.ai.greenstonebio.com/",
      "github_url": "https://github.com/swansonk14/admet_ai",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-11",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Strong baselines + web tool; builds on TDC.",
      "related_benchmarks": [
        "tdc-admet",
        "moleculenet"
      ],
      "expert_ids": [
        "kyle-swanson",
        "regina-barzilay"
      ],
      "group_ids": [
        "mit-csail",
        "barzilay-lab"
      ],
      "hosted_by": [],
      "composite_score": 79.5
    },
    {
      "id": "ames",
      "name": "AMES (mutagenicity)",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "AMES bacterial mutagenicity benchmark \u2014 standard gentox endpoint.",
      "size": {
        "molecules": 7278
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#ames",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/22ames/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Core gentox endpoint.",
      "related_benchmarks": [
        "tdc-admet",
        "tox21"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 79.5
    },
    {
      "id": "polaris-biologics",
      "name": "Polaris Biologics (Polyreactivity / SEC / Tm)",
      "stages": [
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "developability"
      ],
      "description": "Polaris Hub biologics benchmarks: polyreactivity, SEC-SMAC, Tm + titer.",
      "size": {
        "tasks": 6,
        "antibodies": 2000
      },
      "primary_paper": {
        "title": "Polaris biologics method comparison",
        "authors": [
          "Wognum C",
          "et al."
        ],
        "year": 2024,
        "doi": "N/A",
        "citations": 15
      },
      "official_url": "https://polarishub.io/benchmarks",
      "github_url": "https://github.com/polaris-hub/polaris",
      "leaderboard_url": "https://polarishub.io/benchmarks",
      "license": "Polaris Community",
      "first_release": "2024-04",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "flags": [],
      "notes": "Industry-donated; growing.",
      "related_benchmarks": [
        "polaris-admet"
      ],
      "expert_ids": [
        "cas-wognum"
      ],
      "group_ids": [
        "valence-labs",
        "recursion"
      ],
      "hosted_by": [
        "polaris",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 79.0
    },
    {
      "id": "moleculenet",
      "name": "MoleculeNet",
      "stages": [
        "lead-id-admet",
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "regression",
        "classification",
        "quantum"
      ],
      "description": "Multi-task molecular ML benchmark covering quantum, physical, biophysical, physiological properties.",
      "size": {
        "tasks": 17,
        "molecules": 700000
      },
      "primary_paper": {
        "title": "MoleculeNet: A Benchmark for Molecular Machine Learning",
        "authors": [
          "Wu Z",
          "Ramsundar B",
          "Feinberg EN",
          "et al."
        ],
        "year": 2018,
        "doi": "10.1039/C7SC02664A",
        "citations": 3600
      },
      "official_url": "https://moleculenet.org/",
      "github_url": "https://github.com/deepchem/deepchem",
      "leaderboard_url": "https://moleculenet.org/full-results",
      "license": "MIT",
      "first_release": "2018-03",
      "last_updated": "2023-11",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Widely cited (3600+); aging splits with known scaffold leakage.",
      "related_benchmarks": [
        "tdc-admet",
        "polaris-admet"
      ],
      "expert_ids": [
        "bharath-ramsundar",
        "vijay-pande"
      ],
      "group_ids": [
        "pande-lab",
        "deepchem"
      ],
      "hosted_by": [
        "moleculenet",
        "deepchem",
        "papers-with-code-drug"
      ],
      "composite_score": 78.0
    },
    {
      "id": "uspto-retrosyn",
      "name": "USPTO-50K / USPTO-MIT (Retrosynthesis)",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "retrosynthesis",
        "reaction-prediction"
      ],
      "description": "Reactions extracted from USPTO patents; standard retrosynthesis/forward-reaction benchmark.",
      "size": {
        "reactions": 1800000,
        "canonical_50k": 50037
      },
      "primary_paper": {
        "title": "Neural Sequence-to-Sequence Models for Retrosynthesis Prediction",
        "authors": [
          "Liu B",
          "Ramsundar B",
          "Kawthekar P",
          "et al."
        ],
        "year": 2017,
        "doi": "10.1021/acscentsci.7b00303",
        "citations": 520
      },
      "official_url": "https://github.com/Hanjun-Dai/GLN",
      "github_url": "https://github.com/Hanjun-Dai/GLN",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2017",
      "last_updated": "2023",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Known leakage across canonical splits; use time-split or ORD for fairer eval.",
      "related_benchmarks": [
        "open-reaction-database",
        "chemrxiv-reactions"
      ],
      "expert_ids": [
        "bharath-ramsundar",
        "connor-coley"
      ],
      "group_ids": [
        "mit-csail",
        "coley-lab"
      ],
      "hosted_by": [
        "tdc",
        "papers-with-code-drug"
      ],
      "composite_score": 78.0
    },
    {
      "id": "tox21",
      "name": "Tox21",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "US Tox21 program HTS data on 12 nuclear receptor / stress response assays.",
      "size": {
        "compounds": 10000,
        "assays": 12
      },
      "primary_paper": {
        "title": "Tox21 Challenge to Build Predictive Models of Nuclear Receptor and Stress Response Pathways",
        "authors": [
          "Huang R",
          "Xia M",
          "et al."
        ],
        "year": 2016,
        "doi": "10.3389/fenvs.2015.00085",
        "citations": 1100
      },
      "official_url": "https://tripod.nih.gov/tox21/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Public",
      "first_release": "2014",
      "last_updated": "2017",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Field-standard tox benchmark; endpoint count small vs modern suites.",
      "related_benchmarks": [
        "toxcast",
        "moleculenet"
      ],
      "expert_ids": [
        "ruili-huang"
      ],
      "group_ids": [
        "ncats"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 77.5
    },
    {
      "id": "iglm-bench",
      "name": "IgLM / AntiBERTa benchmarks",
      "stages": [
        "hit-id",
        "developmental-candidate"
      ],
      "modalities": [
        "biologic-mab"
      ],
      "task_types": [
        "antibody-generation",
        "liability-prediction"
      ],
      "description": "Antibody LM eval \u2014 paratope prediction, CDR generation, developability.",
      "size": {
        "sequences": 600000000,
        "tasks": 6
      },
      "primary_paper": {
        "title": "Generative language models for antibody design",
        "authors": [
          "Shuai RW",
          "Ruffolo JA",
          "Gray JJ"
        ],
        "year": 2023,
        "doi": "10.1016/j.cels.2023.07.001",
        "citations": 140
      },
      "official_url": "https://github.com/Graylab/IgLM",
      "github_url": "https://github.com/Graylab/IgLM",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024-08",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Moves toward true developability benchmarks.",
      "related_benchmarks": [
        "oas",
        "sabdab"
      ],
      "expert_ids": [
        "jeffrey-gray",
        "richard-shuai"
      ],
      "group_ids": [
        "jhu-gray-lab"
      ],
      "hosted_by": [],
      "composite_score": 77.5
    },
    {
      "id": "geneformer-bench",
      "name": "Geneformer Eval",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "cell-type-annotation",
        "gene-dosage"
      ],
      "description": "Evaluation of Geneformer (transformer pretrained on a ~30M-cell single-cell corpus) on zero-shot & few-shot downstream tasks.",
      "size": {
        "cells": 30000000,
        "tasks": 7
      },
      "primary_paper": {
        "title": "Transfer learning enables predictions in network biology",
        "authors": [
          "Theodoris CV",
          "Xiao L",
          "Chopra A",
          "et al."
        ],
        "year": 2023,
        "doi": "10.1038/s41586-023-06139-9",
        "citations": 550
      },
      "official_url": "https://huggingface.co/ctheodoris/Geneformer",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-06",
      "last_updated": "2024-11",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [
        "self_referential"
      ],
      "notes": "Author-led eval; still widely re-run on OpenProblems tasks.",
      "related_benchmarks": [
        "openproblems-perturbation",
        "scgpt-bench"
      ],
      "expert_ids": [
        "christina-theodoris"
      ],
      "group_ids": [
        "broad-institute"
      ],
      "hosted_by": [],
      "composite_score": 77.0
    },
    {
      "id": "tdc-drug-syn",
      "name": "TDC DrugSyn (OncoPolyPharm + DrugComb_NCI60)",
      "stages": [
        "developmental-candidate",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "drug-synergy"
      ],
      "description": "Drug combination synergy benchmark (NCI ALMANAC + OncoPolyPharmacology).",
      "size": {
        "combos": 23000,
        "cell_lines": 60
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/multi_pred_tasks/drugsyn/",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Important for combination therapy design.",
      "related_benchmarks": [
        "tdc-admet",
        "depmap"
      ],
      "expert_ids": [
        "kexin-huang",
        "marinka-zitnik"
      ],
      "group_ids": [
        "tdc",
        "zitnik-lab"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 77.0
    },
    {
      "id": "pkpd-obach",
      "name": "Obach PK Dataset",
      "stages": [
        "phase-1",
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "pk-regression"
      ],
      "description": "Obach human PK dataset (t1/2, VDss, CL) \u2014 standard human-PK ML benchmark.",
      "size": {
        "compounds": 1338,
        "endpoints": 3
      },
      "primary_paper": {
        "title": "Trend analysis of a database of intravenous pharmacokinetic parameters in humans for 670 drug compounds",
        "authors": [
          "Obach RS",
          "Lombardo F",
          "Waters NJ"
        ],
        "year": 2008,
        "doi": "10.1124/dmd.108.020479",
        "citations": 820
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/",
      "license": "MIT",
      "first_release": "2008",
      "last_updated": "2024-06",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Small but highest-quality human-PK dataset.",
      "related_benchmarks": [
        "tdc-admet"
      ],
      "expert_ids": [
        "scott-obach",
        "franco-lombardo"
      ],
      "group_ids": [
        "pfizer"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 77.0
    },
    {
      "id": "hint-trialbench",
      "name": "HINT / TrialBench",
      "stages": [
        "phase-2",
        "phase-3",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Clinical trial outcome prediction benchmarks built on ClinicalTrials.gov (17-21k trials).",
      "size": {
        "trials": 21000,
        "drugs": 12000,
        "diseases": 5000
      },
      "primary_paper": {
        "title": "Hierarchical Interaction Network for Clinical Trial Outcome Prediction",
        "authors": [
          "Fu T",
          "Huang K",
          "Xiao C",
          "Glass L",
          "Sun J"
        ],
        "year": 2022,
        "doi": "10.1016/j.patter.2022.100445",
        "citations": 200
      },
      "official_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "github_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024-07",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Limited by ClinicalTrials.gov quality.",
      "related_benchmarks": [
        "top-benchmark",
        "ctod"
      ],
      "expert_ids": [
        "tianfan-fu",
        "jimeng-sun",
        "marinka-zitnik"
      ],
      "group_ids": [
        "fu-lab",
        "sun-lab-gatech",
        "zitnik-lab"
      ],
      "hosted_by": [
        "trialbench",
        "tdc"
      ],
      "composite_score": 76.5
    },
    {
      "id": "top-benchmark",
      "name": "Trial Outcome Prediction (TOP)",
      "stages": [
        "phase-3",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome-prediction"
      ],
      "description": "Benchmarks for predicting Phase 1-3 trial outcomes from structured + text features.",
      "size": {
        "trials": 17000,
        "phases": 3
      },
      "primary_paper": {
        "title": "Artificial intelligence for clinical trial design",
        "authors": [
          "Harrer S",
          "Shah P",
          "Antony B",
          "Hu J"
        ],
        "year": 2019,
        "doi": "10.1016/j.tips.2019.05.005",
        "citations": 520
      },
      "official_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "github_url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Often reported alongside HINT.",
      "related_benchmarks": [
        "hint-trialbench"
      ],
      "expert_ids": [
        "tianfan-fu",
        "jimeng-sun"
      ],
      "group_ids": [
        "fu-lab",
        "sun-lab-gatech"
      ],
      "hosted_by": [
        "trialbench"
      ],
      "composite_score": 76.5
    },
    {
      "id": "casf-2016",
      "name": "CASF-2016",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "scoring-function-eval"
      ],
      "description": "Comparative Assessment of Scoring Functions \u2014 scoring, ranking, docking, screening power tests.",
      "size": {
        "complexes": 285,
        "decoy_poses": 28500
      },
      "primary_paper": {
        "title": "Comparative Assessment of Scoring Functions: The CASF-2016 Update",
        "authors": [
          "Su M",
          "Yang Q",
          "Du Y",
          "et al."
        ],
        "year": 2019,
        "doi": "10.1021/acs.jcim.8b00545",
        "citations": 430
      },
      "official_url": "http://www.pdbbind.org.cn/casf.php",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic-only",
      "first_release": "2016",
      "last_updated": "2019",
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Authoritative scoring-power eval; update cadence slow.",
      "related_benchmarks": [
        "pdbbind",
        "posebusters",
        "plinder"
      ],
      "expert_ids": [
        "renxiao-wang"
      ],
      "group_ids": [
        "simm-shanghai"
      ],
      "hosted_by": [
        "pdbbind-casf"
      ],
      "composite_score": 76.2
    },
    {
      "id": "pdbbind",
      "name": "PDBbind",
      "stages": [
        "hit-id",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule",
        "protein-general"
      ],
      "task_types": [
        "binding-affinity"
      ],
      "description": "Curated PDB complexes with experimental binding affinities \u2014 de facto standard for ML affinity prediction.",
      "size": {
        "complexes": 23500,
        "general_set": 19443,
        "refined_set": 5316
      },
      "primary_paper": {
        "title": "The PDBbind Database: Methodologies and Updates",
        "authors": [
          "Wang R",
          "Fang X",
          "Lu Y",
          "Wang S"
        ],
        "year": 2005,
        "doi": "10.1021/jm048957q",
        "citations": 2100
      },
      "official_url": "http://www.pdbbind.org.cn/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Academic-only",
      "first_release": "2004",
      "last_updated": "2022-01",
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Scaffold/temporal leakage well-documented. Pair with CASF + LeakyPDB.",
      "related_benchmarks": [
        "casf-2016",
        "plinder",
        "posebusters"
      ],
      "expert_ids": [
        "renxiao-wang"
      ],
      "group_ids": [
        "simm-shanghai"
      ],
      "hosted_by": [
        "pdbbind-casf"
      ],
      "composite_score": 75.9
    },
    {
      "id": "sider",
      "name": "SIDER",
      "stages": [
        "post-market-rwe",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "adverse-event"
      ],
      "description": "Drug-side effect associations mined from FDA labels.",
      "size": {
        "drugs": 1430,
        "side_effect_pairs": 139000
      },
      "primary_paper": {
        "title": "The SIDER database of drugs and side effects",
        "authors": [
          "Kuhn M",
          "Letunic I",
          "Jensen LJ",
          "Bork P"
        ],
        "year": 2016,
        "doi": "10.1093/nar/gkv1075",
        "citations": 1700
      },
      "official_url": "http://sideeffects.embl.de/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY-NC-SA 4.0",
      "first_release": "2010",
      "last_updated": "2016",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Aging but still widely used. TWOSIDES/OffSides offer newer signals.",
      "related_benchmarks": [
        "offsides-twosides",
        "faers-bench"
      ],
      "expert_ids": [
        "michael-kuhn",
        "peer-bork"
      ],
      "group_ids": [
        "embl-bork"
      ],
      "hosted_by": [
        "faers"
      ],
      "composite_score": 74.9
    },
    {
      "id": "tape",
      "name": "TAPE",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-ml"
      ],
      "description": "Tasks Assessing Protein Embeddings \u2014 5 tasks (secondary structure, contact, fluorescence, stability, homology).",
      "size": {
        "tasks": 5
      },
      "primary_paper": {
        "title": "Evaluating Protein Transfer Learning with TAPE",
        "authors": [
          "Rao R",
          "Bhattacharya N",
          "Thomas N",
          "Dai Y",
          "Liu P",
          "Canny J",
          "Abbeel P",
          "Song YS"
        ],
        "year": 2019,
        "doi": "10.48550/arXiv.1906.08230",
        "citations": 950
      },
      "official_url": "https://github.com/songlab-cal/tape",
      "github_url": "https://github.com/songlab-cal/tape",
      "leaderboard_url": "N/A",
      "license": "BSD-3",
      "first_release": "2019",
      "last_updated": "2022",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "deprecated-recommend-replace"
      ],
      "notes": "Historically important; largely superseded by ProteinGym/FLIP for fitness and by PEER for broader tasks.",
      "related_benchmarks": [
        "proteingym",
        "peer"
      ],
      "expert_ids": [
        "roshan-rao",
        "pieter-abbeel",
        "yun-song"
      ],
      "group_ids": [
        "uc-berkeley"
      ],
      "hosted_by": [
        "papers-with-code-drug"
      ],
      "composite_score": 74.9
    },
    {
      "id": "simcyp-validation",
      "name": "Simcyp Validation Sets",
      "stages": [
        "phase-1",
        "phase-2",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "pbpk-validation"
      ],
      "description": "PBPK validation datasets used by Simcyp/Certara community (DDI, pediatric, renal impairment).",
      "size": {
        "scenarios": 100
      },
      "primary_paper": {
        "title": "Physiologically based pharmacokinetic modeling: Methods and applications in pharmacotherapy",
        "authors": [
          "Rostami-Hodjegan A",
          "Tucker GT"
        ],
        "year": 2007,
        "doi": "10.1038/nrd2173",
        "citations": 780
      },
      "official_url": "https://www.certara.com/software/simcyp/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "Proprietary",
      "first_release": "2001",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 2,
        "industry_relevance": 5
      },
      "flags": [
        "license-gated-commercial"
      ],
      "notes": "Industry gold standard but proprietary. Open benchmarks exist via OSP Suite.",
      "related_benchmarks": [
        "pksim"
      ],
      "expert_ids": [
        "amin-rostami-hodjegan"
      ],
      "group_ids": [
        "certara"
      ],
      "hosted_by": [],
      "composite_score": 74.4
    },
    {
      "id": "peer",
      "name": "PEER",
      "stages": [
        "target-id",
        "developmental-candidate"
      ],
      "modalities": [
        "protein-general"
      ],
      "task_types": [
        "protein-ml"
      ],
      "description": "PEER \u2014 14 protein property prediction tasks across structure/function/interaction.",
      "size": {
        "tasks": 14
      },
      "primary_paper": {
        "title": "PEER: A Comprehensive and Multi-Task Benchmark for Protein Sequence Understanding",
        "authors": [
          "Xu M",
          "Yuan X",
          "Miret S",
          "Tang J"
        ],
        "year": 2022,
        "doi": "10.48550/arXiv.2206.02096",
        "citations": 130
      },
      "official_url": "https://github.com/DeepGraphLearning/PEER_Benchmark",
      "github_url": "https://github.com/DeepGraphLearning/PEER_Benchmark",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2022",
      "last_updated": "2024",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Broader than TAPE, tighter than ProteinGym; good middle ground.",
      "related_benchmarks": [
        "tape",
        "proteingym",
        "flip"
      ],
      "expert_ids": [
        "jian-tang",
        "minghao-xu"
      ],
      "group_ids": [
        "mila-quebec"
      ],
      "hosted_by": [
        "papers-with-code-drug"
      ],
      "composite_score": 74.4
    },
    {
      "id": "clawbio-bench",
      "name": "ClawBio Skill Correctness Bench",
      "stages": [
        "disease-modeling",
        "target-id",
        "clinical-development"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "correctness-audit",
        "safety-audit"
      ],
      "description": "Third-party (Biostochastics LLC) benchmark of bio-analysis skills on safety / correctness / honesty. 10 skills \u00d7 182 tests.",
      "size": {
        "skills": 10,
        "tests": 182,
        "pass_rate_pct": 92.3
      },
      "primary_paper": {
        "title": "clawbio_bench README (v0.1.5)",
        "authors": [
          "Biostochastics LLC"
        ],
        "year": 2026,
        "doi": "N/A \u2014 repo",
        "citations": 5
      },
      "official_url": "https://clawbio.ai/benchmarks.html",
      "github_url": "https://github.com/biostochastics/clawbio_bench",
      "leaderboard_url": "https://clawbio.ai/benchmarks.html",
      "license": "MIT",
      "first_release": "2026-04",
      "last_updated": "2026-05-03",
      "rubric": {
        "rigor": 5,
        "coverage": 2,
        "maintenance": 5,
        "adoption": 2,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Independent third-party bench structurally precludes self-reference. Coverage narrow but rigor exemplary.",
      "related_benchmarks": [],
      "expert_ids": [],
      "group_ids": [
        "clawbio",
        "biostochastics"
      ],
      "hosted_by": [
        "clawbio"
      ],
      "composite_score": 74.2
    },
    {
      "id": "herg-classifier-bench",
      "name": "hERG (cardio-tox) TDC",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "Cardiac tox benchmark (hERG inhibition) \u2014 standardized from Wang et al.",
      "size": {
        "molecules": 655,
        "assays": 1
      },
      "primary_paper": {
        "title": "Therapeutics Data Commons",
        "authors": [
          "Huang K",
          "Fu T",
          "et al."
        ],
        "year": 2021,
        "doi": "10.48550/arXiv.2102.09548",
        "citations": 620
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#herg",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/24herg/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 4,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Small but widely benchmarked. Industry pairs with SafetyPanel-5.",
      "related_benchmarks": [
        "tdc-admet",
        "ames"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-scienceaibench",
        "insilico-ddb"
      ],
      "composite_score": 73.9
    },
    {
      "id": "dili-ldi",
      "name": "DILI / LD50 Zhu",
      "stages": [
        "ind-enabling",
        "lead-id-admet"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-regression"
      ],
      "description": "Drug-induced liver injury + rat LD50 (Zhu) \u2014 standard acute tox benchmarks.",
      "size": {
        "dili_molecules": 475,
        "ld50_molecules": 7385
      },
      "primary_paper": {
        "title": "Quantitative structure-activity relationship modeling of rat acute toxicity",
        "authors": [
          "Zhu H",
          "Martin TM",
          "Ye L",
          "et al."
        ],
        "year": 2009,
        "doi": "10.1021/tx900189p",
        "citations": 460
      },
      "official_url": "https://tdcommons.ai/single_pred_tasks/tox/#dili",
      "github_url": "https://github.com/mims-harvard/TDC",
      "leaderboard_url": "https://tdcommons.ai/benchmark/admet_group/",
      "license": "MIT",
      "first_release": "2021-02",
      "last_updated": "2024-12",
      "rubric": {
        "rigor": 4,
        "coverage": 2,
        "maintenance": 4,
        "adoption": 4,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Essential IND-enabling endpoints.",
      "related_benchmarks": [
        "tdc-admet",
        "tox21",
        "toxcast"
      ],
      "expert_ids": [
        "kexin-huang"
      ],
      "group_ids": [
        "tdc"
      ],
      "hosted_by": [
        "tdc",
        "insilico-ddb"
      ],
      "composite_score": 73.9
    },
    {
      "id": "scgpt-bench",
      "name": "scGPT Evaluation Suite",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "cell-type-annotation",
        "perturbation-prediction"
      ],
      "description": "Evaluation shipped with scGPT foundation model (cell type annotation, GRN, perturbation).",
      "size": {
        "cells": 33000000,
        "tasks": 5
      },
      "primary_paper": {
        "title": "scGPT: toward building a foundation model for single-cell multi-omics using generative AI",
        "authors": [
          "Cui H",
          "Wang C",
          "Maan H",
          "Pang K",
          "Luo F",
          "Wang B"
        ],
        "year": 2024,
        "doi": "10.1038/s41592-024-02201-0",
        "citations": 520
      },
      "official_url": "https://github.com/bowang-lab/scGPT",
      "github_url": "https://github.com/bowang-lab/scGPT",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2023-05",
      "last_updated": "2025-01",
      "rubric": {
        "rigor": 3,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "self_referential"
      ],
      "notes": "Evaluation dominated by authors' own model \u2014 flagged self-referential. Pair with OpenProblems for fair comparison.",
      "related_benchmarks": [
        "openproblems-perturbation"
      ],
      "expert_ids": [
        "bo-wang"
      ],
      "group_ids": [
        "wang-lab-toronto"
      ],
      "hosted_by": [],
      "composite_score": 73.7
    },
    {
      "id": "ctod",
      "name": "CT-Outcome (TrialBench v2)",
      "stages": [
        "phase-2",
        "phase-3"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "trial-outcome"
      ],
      "description": "Updated 2024 trial outcome benchmark with temporal splits.",
      "size": {
        "trials": 25000
      },
      "primary_paper": {
        "title": "TrialBench: Multi-Modal Artificial Intelligence-Ready Clinical Trial Datasets",
        "authors": [
          "Chen J",
          "Hu Y",
          "Wang Y",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2407.00631",
        "citations": 25
      },
      "official_url": "https://github.com/ML2Health/ML2ClinicalTrials",
      "github_url": "https://github.com/ML2Health/ML2ClinicalTrials",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2024-07",
      "last_updated": "2025-03",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 2,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Temporal splits are key improvement.",
      "related_benchmarks": [
        "hint-trialbench"
      ],
      "expert_ids": [
        "yue-wang",
        "tianfan-fu"
      ],
      "group_ids": [
        "ml2health"
      ],
      "hosted_by": [
        "trialbench"
      ],
      "composite_score": 73.4
    },
    {
      "id": "dude",
      "name": "DUD-E",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Directory of Useful Decoys \u2014 Enhanced. 102 targets \u00d7 actives + property-matched decoys.",
      "size": {
        "targets": 102,
        "actives": 22886,
        "decoys": 1411214
      },
      "primary_paper": {
        "title": "Directory of Useful Decoys, Enhanced (DUD-E)",
        "authors": [
          "Mysinger MM",
          "Carchia M",
          "Irwin JJ",
          "Shoichet BK"
        ],
        "year": 2012,
        "doi": "10.1021/jm300687e",
        "citations": 2900
      },
      "official_url": "http://dude.docking.org/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2012",
      "last_updated": "2014",
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 1,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "data-leakage-known",
        "deprecated-recommend-replace"
      ],
      "notes": "Well-known analog bias in decoy selection; use LIT-PCBA / PLINDER for fair VS.",
      "related_benchmarks": [
        "lit-pcba",
        "dekois",
        "plinder"
      ],
      "expert_ids": [
        "brian-shoichet",
        "john-irwin"
      ],
      "group_ids": [
        "shoichet-lab-ucsf"
      ],
      "hosted_by": [
        "tdc"
      ],
      "composite_score": 72.9
    },
    {
      "id": "moses",
      "name": "MOSES",
      "stages": [
        "lead-id-admet",
        "developmental-candidate"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "molecule-generation"
      ],
      "description": "Molecular Sets \u2014 distribution learning benchmark with 8 reference metrics on ZINC subset.",
      "size": {
        "train_set": 1936962,
        "metrics": 8
      },
      "primary_paper": {
        "title": "Molecular Sets (MOSES): A Benchmarking Platform for Molecular Generation Models",
        "authors": [
          "Polykovskiy D",
          "Zhebrak A",
          "Sanchez-Lengeling B",
          "et al."
        ],
        "year": 2020,
        "doi": "10.3389/fphar.2020.565644",
        "citations": 550
      },
      "official_url": "https://github.com/molecularsets/moses",
      "github_url": "https://github.com/molecularsets/moses",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2018-11",
      "last_updated": "2022-04",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [],
      "notes": "Distribution-learning metrics known to saturate.",
      "related_benchmarks": [
        "guacamol",
        "pmo"
      ],
      "expert_ids": [
        "daniil-polykovskiy",
        "alex-zhavoronkov",
        "alan-aspuru-guzik"
      ],
      "group_ids": [
        "insilico-medicine",
        "matter-lab-toronto"
      ],
      "hosted_by": [
        "tdc",
        "moleculenet"
      ],
      "composite_score": 72.4
    },
    {
      "id": "perturbbench",
      "name": "PerturbBench",
      "stages": [
        "virtual-cell"
      ],
      "modalities": [
        "cross-modality"
      ],
      "task_types": [
        "perturbation-prediction"
      ],
      "description": "Benchmark for generalization of perturbation foundation models to unseen genetic perturbations / cell contexts.",
      "size": {
        "cells": 400000,
        "perturbations": 200
      },
      "primary_paper": {
        "title": "PerturbBench: Benchmarking Single-Cell Perturbation Foundation Models",
        "authors": [
          "Wu Y",
          "Barry T",
          "Wang K",
          "et al."
        ],
        "year": 2024,
        "doi": "10.48550/arXiv.2412.10091",
        "citations": 35
      },
      "official_url": "https://github.com/genentech/PerturbBench",
      "github_url": "https://github.com/genentech/PerturbBench",
      "leaderboard_url": "N/A",
      "license": "Apache-2.0",
      "first_release": "2024-12",
      "last_updated": "2025-06",
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "flags": [],
      "notes": "Pharma-led (Genentech); well-specified eval.",
      "related_benchmarks": [
        "openproblems-perturbation",
        "scperturb"
      ],
      "expert_ids": [
        "aviv-regev"
      ],
      "group_ids": [
        "genentech-gred"
      ],
      "hosted_by": [],
      "composite_score": 71.4
    },
    {
      "id": "clintox",
      "name": "ClinTox",
      "stages": [
        "lead-id-admet",
        "ind-enabling"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "toxicity-classification"
      ],
      "description": "Binary classification of FDA-approved vs. trial-failed-for-toxicity compounds.",
      "size": {
        "compounds": 1491
      },
      "primary_paper": {
        "title": "Deep learning for drug-induced liver injury",
        "authors": [
          "Xu Y",
          "Dai Z",
          "Chen F",
          "et al."
        ],
        "year": 2015,
        "doi": "10.1021/acs.jcim.5b00238",
        "citations": 380
      },
      "official_url": "https://moleculenet.org/datasets-1",
      "github_url": "https://github.com/deepchem/deepchem",
      "leaderboard_url": "N/A",
      "license": "MIT",
      "first_release": "2015",
      "last_updated": "2022",
      "rubric": {
        "rigor": 3,
        "coverage": 2,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "flags": [
        "data-leakage-known"
      ],
      "notes": "Small, binary; saturated. Useful only as sanity check.",
      "related_benchmarks": [
        "tox21",
        "toxcast"
      ],
      "expert_ids": [
        "bharath-ramsundar"
      ],
      "group_ids": [
        "deepchem"
      ],
      "hosted_by": [
        "moleculenet",
        "tdc"
      ],
      "composite_score": 65.6
    },
    {
      "id": "dekois",
      "name": "DEKOIS 2.0",
      "stages": [
        "hit-id"
      ],
      "modalities": [
        "small-molecule"
      ],
      "task_types": [
        "virtual-screening"
      ],
      "description": "Decoy sets matched to actives by physicochemical properties for structure-based VS.",
      "size": {
        "targets": 81,
        "actives_per_target": 40
      },
      "primary_paper": {
        "title": "DEKOIS 2.0 \u2013 A Public Resource for Benchmarking Structure-based Virtual Screening",
        "authors": [
          "Bauer MR",
          "Ibrahim TM",
          "Vogel SM",
          "Boeckler FM"
        ],
        "year": 2013,
        "doi": "10.1021/ci400115b",
        "citations": 190
      },
      "official_url": "http://www.dekois.com/",
      "github_url": "N/A",
      "leaderboard_url": "N/A",
      "license": "CC-BY",
      "first_release": "2013",
      "last_updated": "2019",
      "rubric": {
        "rigor": 3,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 3,
        "quality": 3,
        "accessibility": 4,
        "industry_relevance": 2
      },
      "flags": [
        "deprecated-recommend-replace"
      ],
      "notes": "Historical reference; use LIT-PCBA / PLINDER for modern VS.",
      "related_benchmarks": [
        "dude",
        "lit-pcba"
      ],
      "expert_ids": [
        "frank-boeckler"
      ],
      "group_ids": [
        "tuebingen-boeckler"
      ],
      "hosted_by": [],
      "composite_score": 57.5
    }
  ]
}