% Journal article record. The page range is inferred from the DOI suffix (.../15/1/30-43); confirm against the publisher PDF.
@article{4680,
  author   = {Wang, Jun},
  title    = {Comparative Unsupervised Anomaly Detection in Mixed Code--Text Corpora Using {Isolation Forest} and {PCA} Autoencoders},
  journal  = {Progress in Computing Applications},
  year     = {2026},
  volume   = {15},
  number   = {1},
  pages    = {30--43},
  doi      = {10.6025/pca/2026/15/1/30-43},
  url      = {https://www.dline.info/pca/fulltext/v15n1/pcav15n1_3.pdf},
  abstract = {Anomaly detection (AD) in textual data remains challenging due to semantic complexity and the scarcity of labeled anomalous examples. This study proposes a comparative unsupervised framework to identify irregularities within mixed code text corpora, specifically addressing the heterogeneity of technical data from developer forums. We evaluate two distinct paradigms Isolation Forest (density-based isolation) and PCA Autoencoder (reconstruction based error) on a stratified subset of 10,000 Stack Overflow samples, derived from 52,270 total entries and represented via 500 dimensional TF-IDF features. Results indicate that while both methods identified 500 anomalies (5\% contamination rate), they exhibited markedly different detection patterns. Isolation Forest showed a strong bias toward natural language, flagging 9.54\% of text samples versus 0.08\% of code samples. Conversely, the PCA Autoencoder detected a more balanced distribution, identifying 3.15\% of code and 6.71\% of text samples as anomalous. Agreement between methods was low ($\sim$3\%), with only 15 consensus anomalies, suggesting complementary notions of irregularity rather than redundant signals. Isolation Forest captured global outliers, such as verbose questions, while the autoencoder detected local irregularities, such as unusual code constructs. These findings underscore the value of multiperspective anomaly detection strategies for comprehensive coverage of irregular patterns. Ultimately, systematic anomaly analysis supports dataset cleaning, model robustness evaluation, and improved preprocessing for downstream tasks, including code understanding and automated question answering systems in software engineering applications.},
}