@article{Hassler_RG_20260301,
  author   = {Hassler, Birgit and Hoffman, Forrest M. and Beadling, Rebecca and Blockley, Ed and Huang, Bo and Lee, Jiwoo and Lembo, Valerio and Lewis, Jared and Lu, Jianhua and Madaus, Luke and Malinina, Elizaveta and Medeiros, Brian and Pokam, Wilfried and Scoccimarro, Enrico and Swaminathan, Ranjini},
  title    = {Systematic Benchmarking of Climate Models: {Methodologies}, Applications, and New Directions},
  journal  = RG,
  volume   = 64,
  number   = 1,
  pages    = {e2025RG000891},
  doi      = {10.1029/2025RG000891},
  day      = 1,
  month    = mar,
  year     = 2026,
  abstract = {As climate models become increasingly complex, there is a growing need to comprehensively and systematically assess model performance with respect to observations. Given the increasing number and diversity of climate model simulations in use, the community has moved beyond simple model intercomparison and toward developing methods capable of benchmarking a large number of simulations against a suite of climate metrics. Here, we present a detailed review of evaluation and benchmarking methods and approaches developed in the last decade, focusing primarily on scientific implications for Coupled Model Intercomparison Project (CMIP) simulations and CMIP6 results that contributed to the Intergovernmental Panel on Climate Change (IPCC) Sixth Assessment Report (AR6). Based on this review, we explain the resulting contemporary philosophy of model benchmarking, and provide clear distinctions and definitions of the terms model verification, process validation, evaluation, and benchmarking. While significant progress has been made in model development based on systematic evaluation and benchmarking efforts, some climate system biases still remain. The development of open-source community software packages has played a fundamental role in identifying areas of significant model improvement and bias reduction. We review the key features of several software packages that have been commonly used over the past decade to evaluate and benchmark global and regional climate models. Additionally, we discuss best practices for the selection of evaluation and benchmarking metrics and for interpreting the obtained results, the importance of selecting suitable sources of reference data and accurate uncertainty quantification.},
}