% Journal article entry. The numeric key 4740 is kept unchanged so existing citations keep resolving.
% NOTE(review): pages are taken from the DOI suffix (59-75); confirm against the published PDF.
@article{4740,
  author   = {Pichappan, Pit},
  title    = {A Comprehensive Analysis of Code Duplication, Data Leakage, and Clone Detection in Large-Scale {Python} Corpora},
  journal  = {Journal of Intelligent Computing},
  year     = {2026},
  volume   = {17},
  number   = {2},
  pages    = {59--75},
  doi      = {10.6025/jic/2026/17/2/59-75},
  url      = {https://www.dline.info/jic/fulltext/v17n2/jicv17n2_2.pdf},
  abstract = {Code duplication is a pervasive phenomenon in software repositories that poses significant risks for both software quality and machine learning evaluation. This study presents a comprehensive analysis of code duplication, data leakage, and clone detection within large-scale Python corpora, focusing on the widely used py150 benchmark and its declbodies splits. Using a duplication index, we identify 7,336 duplicate groups comprising 17,033 entries, with an average cluster size of 2.3. Critically, we detect 575 leakage groups (7.9\%) spanning training, validation, and test splits, including 280 test samples and 303 validation samples that appear verbatim in the training data---a violation of dataset independence that can inflate performance metrics through memorization. To address this, we implement a deduplication strategy that eliminates all cross-split leakage. We then evaluate binary and multi-class clone detection using traditional models (SVM, Random Forest) and transformer-based architectures (CodeBERT). Results show that transformer models achieve strong performance (AUC up to 0.97) on syntactic clones (Types I and II) but exhibit systematic degradation on semantic clones (Type IV, F1-score 0.72). Statistical tests confirm that observed differences are significant ($p < 0.01$). Our findings underscore that uncontrolled redundancy compromises experimental validity, and we advocate for routine duplication indexing and leakage quantification as essential preprocessing steps for reliable benchmarking of code intelligence systems.},
}