% Journal article record. The DOI is stored bare (no resolver prefix) so styles
% can build the link themselves; braces in the title protect case-sensitive
% words from sentence-casing styles.
@article{4747,
  author   = {Pichappan, Pit},
  title    = {Evaluating Clustering Strategies for Categorical Microbial Data: From {K-Means} Limitations to {Gower}-Based Hierarchical Optimization},
  journal  = {Journal of Data Processing},
  year     = {2026},
  volume   = {16},
  number   = {2},
  doi      = {10.6025/jdp/2026/16/2/74-95},
  url      = {https://www.dline.info/jdp/fulltext/v16n2/jdpv16n2_2.pdf},
  abstract = {Clustering categorical microbial data presents significant methodological challenges due to the absence of inherent metric structures and the limitations of conventional numerical algorithms. This study systematically evaluates clustering strategies for a categorical dataset of approximately 200 bacterial species, characterized by taxonomic, ecological, and pathogenic attributes. We compare a traditional K-Means approach applied to one-hot encoded features with a distance-aware hierarchical clustering framework utilizing Gower dissimilarity. Internal validation metrics, bootstrap-based stability analysis, and multidimensional scaling projections are employed to assess clustering quality, robustness, and biological interpretability. Results indicate that K-Means yields a modest silhouette score (0.19) and moderate stability for a two-cluster solution, reflecting the natural ecological continuity and overlapping niches of microbial taxa rather than algorithmic failure. In contrast, Gower-based hierarchical clustering substantially improves cluster cohesion and separation (silhouette score: 0.34), naturally revealing a robust binary partition primarily governed by human pathogenicity. Density-based methods (DBSCAN) perform poorly due to high sparsity and the absence of well-defined neighborhoods in categorical feature spaces. Visualization and centroid analysis confirm that functional traits and ecological specialization outweigh taxonomic family in driving bacterial similarity.
This study demonstrates that aligning distance metrics with categorical data structures is essential for uncovering biologically meaningful patterns. Gower-based hierarchical optimization emerges as a superior framework for microbial classification, offering enhanced stability, interpretability, and alignment with contemporary ecological principles.},
}