@article{4577,
  author   = {Xiaojuan Chen},
  title    = {Multimodal Music Emotion Classification via Stacking-Based Fusion of Audio and Lyric Features with Transfer Learning},
  journal  = {Journal of E-Technology},
  year     = {2025},
  volume   = {16},
  number   = {4},
  doi      = {10.6025/jet/2025/16/4/142-149},
  url      = {https://www.dline.info/jet/fulltext/v16n4/jetv16n4_3.pdf},
  abstract = {This paper proposes a novel multimodal music emotion classification algorithm that integrates audio and lyrical features to overcome the limitations of single-modality approaches. Recognizing that music conveys emotion through both sound and text, the system employs deep learning techniques, specifically combining 1D CNN and 2D CNN models with C3D and I3D frameworks for audio processing, alongside text analysis using TF-IDF and Word2vec. To effectively fuse these heterogeneous modalities, the study implements a stacking-based decision-level fusion strategy with a Softmax secondary classifier, significantly outperforming feature-level and traditional decision fusion methods. Utilizing transfer learning on datasets such as Sports-1M and Kinetics enhances model generalization, while the Adam and SGD optimizers improve training efficiency. Experimental results on a dataset of 2000 songs (anger, happiness, relaxation, sadness) demonstrate that the proposed multimodal approach achieves a maximum classification accuracy of 78%, a 4% improvement over single-modal classifiers and a 2% gain over other fusion techniques. The method effectively mitigates data heterogeneity and overfitting via 5-fold cross-validation and addresses the challenge of classifying “relaxation” by leveraging complementary audio-lyric cues. Evaluation metrics (accuracy, F1-score, ROC/AUC) confirm superior performance, validating that the synergistic integration of audio spectral features and semantic lyric representations yields more precise, robust, and scalable music emotion recognition, even with limited labeled data.},
}