% Journal article record. NOTE(review): `pages` is inferred from the DOI suffix (…/16/2/78-93) -- confirm against the publisher's page.
@article{4738,
  author   = {Nguyen Minh Tuan},
  title    = {Comprehensive Exploratory Analysis of Stroke Risk Factors: A Statistical Exposition},
  journal  = {Journal of Information Organization},
  year     = {2026},
  volume   = {16},
  number   = {2},
  pages    = {78--93},
  doi      = {10.6025/jio/2026/16/2/78-93},
  url      = {https://www.dline.info/jio/fulltext/v16n2/jiov16n2_3.pdf},
  abstract = {Stroke remains a leading cause of global mortality and long term disability, necessitating reliable predictive frameworks for early risk identification. While machine learning applications in stroke forecasting have expanded, many studies bypass rigorous exploratory data analysis, compromising model interpretability and clinical generalizability. This study presents a comprehensive exploratory investigation of demographic, clinical, and lifestyle determinants of stroke using a structured healthcare dataset of 5,110 patient records. Through descriptive statistics, univariate and bivariate analyses, correlation mapping, and stratified prevalence assessments, we systematically evaluated feature distributions, missingness patterns, and multivariate relationships. The cohort exhibited severe class imbalance, with stroke-positive cases comprising only 4.9\% of observations. Age emerged as the strongest univariate predictor (r = 0.245), followed by preexisting heart disease, elevated average glucose levels, and hypertension. Conversely, body mass index demonstrated weak independent discriminative power, suggesting its influence operates indirectly through metabolic and cardiovascular pathways. Pairwise visualizations and correlation heatmaps confirmed minimal multicollinearity and highlighted clinically meaningful interaction effects between aging and dysglycemia. These findings establish a statistically rigorous foundation for subsequent machine learning development, emphasizing the necessity of imbalance aware evaluation metrics and targeted feature engineering. By bridging raw clinical data and algorithmic deployment, this work provides a transparent, reproducible roadmap to advance clinically actionable stroke risk stratification tools.},
}