% Journal article on label-aware attention for multi-label multi-instance text classification.
% NOTE(review): pages are inferred from the DOI suffix (year/volume/issue/pages); confirm against the published issue.
% The doi field stores the bare identifier; the bibliography style adds the resolver prefix.
@article{4695,
  author   = {Pichappan, Pit},
  title    = {{LA-MIL}: Label-Aware Attention Networks for Multi-Label Multi-Instance Text Classification},
  journal  = {Journal of Information Organization},
  year     = {2026},
  volume   = {16},
  number   = {1},
  pages    = {26--39},
  doi      = {10.6025/jio/2026/16/1/26-39},
  url      = {https://www.dline.info/jio/fulltext/v16n1/jiov16n1_3.pdf},
  abstract = {Multi-label multi-instance text classification presents unique challenges due to the weak supervision setting
where documents (bags) are labeled but constituent sentences (instances) are not, coupled with severe label
imbalance where infrequent ``tail'' labels dominate real-world distributions. Existing approaches typically
employ label-agnostic aggregation strategies such as max or mean pooling that implicitly assume uniform
instance relevance across all labels, an assumption that is frequently violated in social tagging data, where
individual sentences often signal specific labels while remaining irrelevant to others. To address this limitation,
we propose LA-MIL (Label-Aware Attention Multi-Instance Learning), a novel framework that employs
dedicated attention heads for each label to enable fine-grained, label-specific instance selection. This
architecture allows different labels to attend to distinct textual evidence within the same document, relaxing
the restrictive assumption of uniform instance relevance. Evaluated on the DeliciousMIL benchmark dataset
comprising 12,234 web documents annotated with 20 semantic tags, LA-MIL consistently outperforms
traditional multi-label classifiers, standard MIL models with global pooling, and attention-based baselines
with shared aggregation mechanisms. Notably, the model achieves significant improvements in macro-F1
scores, demonstrating superior handling of long-tailed label distributions. Beyond quantitative gains, LA-MIL
provides inherent interpretability through learned attention weights that transparently identify label-discriminative
sentences. Our results establish label-aware attention as an essential architectural principle
for multi-label multi-instance learning, particularly in applications requiring both accuracy on imbalanced
distributions and human-interpretable predictions.},
}