@article{397,
  author   = {Sjobergh, Jonas and Araki, Kenji},
  title    = {What Does 3.3 Mean? {Using} Informal Evaluation Methods to Relate Formal Evaluation Results and Real-World Performance},
  journal  = {International Journal of Computational Linguistics Research},
  year     = {2010},
  volume   = {1},
  number   = {3},
  url      = {http://www.dline.info/jcl/fulltext/v1n3/8.pdf},
  abstract = {We have created an automatic humor generation system for Japanese that generates two man comedy routines or humorous responses to free text input. Evaluating humor is rather difficult since humor is subjective and many factors influence the perceived funniness. We evaluated our system in several ways. First, a traditional evaluation with evaluators ranking comedy performances from 1 (boring) to 5 (funny) gave a result of 3.3. To complement this evaluation and to see if 3.3 is good enough for real world usage we entered a comedy performance created using our system in a funny robot competition with a ¥500,000 prize. We did not win the ¥500,000, but we made it to the final and could use the audience reactions during the live performance for evaluation. We also did sentiment analysis of blog postings covering the competition. These evaluations, while informal, indicate that our system performs competitively compared to the other contestants, that were all handmade systems. That the system could compete against human made contributions shows that a score of 3.3 can be “good enough” for real world applications. We believe that evaluations like these can be useful despite being very informal, since they measure what we want to know: do people expecting something funny think the system is funny?},
}