% Conference paper in the CUI 2024 proceedings (see the doi field).
% Author names use the "von Last, First" form so that the particle "de" is
% parsed as part of the surname rather than as a given name.
% The acronym NLP in the title is brace-protected against style recasing.
% There is deliberately no series field: the exported value merely repeated
% booktitle, which made standard styles print the proceedings title twice.
@inproceedings{9501bb04410b475db3d894e3a2fadf20,
  author    = {Braggaar, Anouck and He, Linwei and de Wit, Jan},
  title     = {Our Dialogue System Sucks - but Luckily we are at the Top of the Leaderboard!: A Discussion on Current Practices in {NLP} Evaluation},
  booktitle = {Proceedings of the 6th Conference on ACM Conversational User Interfaces, CUI 2024},
  year      = {2024},
  month     = jul,
  day       = {8},
  doi       = {10.1145/3640794.3665889},
  language  = {English},
  keywords  = {Evaluation, NLP, leaderboards, multidisciplinary research},
  abstract  = {Currently, leaderboards are often used to evaluate natural language processing (NLP) systems and in particular large language models. In this paper we argue why we should step away from leaderboards and follow a more inclusive approach both in developing as well as in evaluating models. The focus of evaluation should be on the complete context in which the system operates. To accomplish this, researchers should take an inclusive approach and take note of developments in multiple scientific fields (from NLP to communication science).},
  note      = {Publisher Copyright: {\textcopyright} 2024 Owner/Author.},
}