% Conference paper (2025 IEEE ICDMW proceedings) presenting ESGF-Assistant.
% Braces in the title protect whole words whose capitalisation must survive
% style-driven sentence casing; the compound surname "Saedi Nia" is braced so
% it is parsed as a single last name.
@InProceedings{Saedi-Nia_DMESS2025_20251112,
  author       = {Daniel {Saedi Nia} and Elias C. Massoud and Bharat Sharma and Jitendra Kumar and Nathan Collier and Forrest M. Hoffman},
  title        = {{ESGF-Assistant}: A Domain-Specific Large Language Model for Navigating {Earth} System Data},
  booktitle    = {2025 {IEEE} International Conference on Data Mining Workshops ({ICDMW})},
  organization = {Institute of Electrical and Electronics Engineers (IEEE)},
  publisher    = {Conference Publishing Services (CPS)},
  pages        = {836--845},
  doi          = {10.1109/ICDMW69685.2025.00100},
  day          = 12,
  month        = nov,
  year         = 2025,
  abstract     = {Earth system science research depends on large and complex datasets, yet accessing them often requires substantial technical expertise. The Earth System Grid Federation (ESGF) provides distributed access to petabytes of observational and model simulation data, but users unfamiliar with metadata structures or programming face steep barriers. General-purpose large language models (LLMs), while powerful, have shown limited accuracy and reliability for ESGF-specific queries, frequently misinterpreting metadata or producing hallucinated results. To address this gap, we developed a domain-specific LLM trained on curated ESGF instruction-response pairs reflecting realistic user workflows, particularly for intake-esgf code generation. Our approach fine-tunes the LLaMA 3.1(8B) model using parameter-efficient methods, integrates retrieval-augmented generation to ground responses in ESGF and Coupled Model Intercomparison Project (CMIP) metadata, and deploys the assistant through a lightweight, browser-based interface for interactive use. Evaluation using expert review and BERTScore shows that the finetuned model significantly outperforms its untuned counterpart in accuracy, contextual relevance, and usability. This work offers a generalizable framework for applying domain-specific LLMs to complex scientific data infrastructures.},
}