% Journal article record. The numeric key is kept as exported so existing
% \cite{4740} references keep working. The page range is taken from the DOI
% suffix (17/2/59-75 = volume/number/pages) -- confirm against the publisher page.
@article{4740,
  author   = {Pichappan, Pit},
  title    = {A Comprehensive Analysis of Code Duplication, Data Leakage, and Clone Detection in Large-Scale {Python} Corpora},
  journal  = {Journal of Intelligent Computing},
  year     = {2026},
  volume   = {17},
  number   = {2},
  pages    = {59--75},
  doi      = {10.6025/jic/2026/17/2/59-75},
  url      = {https://www.dline.info/jic/fulltext/v17n2/jicv17n2_2.pdf},
  abstract = {Code duplication is a pervasive phenomenon in software repositories that poses significant risks for both
software quality and machine learning evaluation. This study presents a comprehensive analysis of code
duplication, data leakage, and clone detection within large-scale Python corpora, focusing on the widely
used py150 benchmark and its declbodies splits. Using a duplication index, we identify 7,336 duplicate
groups comprising 17,033 entries, with an average cluster size of 2.3. Critically, we detect 575 leakage
groups (7.9\%) spanning training, validation, and test splits, including 280 test samples and 303 validation
samples that appear verbatim in the training data---a violation of dataset independence that can inflate
performance metrics through memorization. To address this, we implement a deduplication strategy that
eliminates all cross-split leakage. We then evaluate binary and multi-class clone detection using traditional
models (SVM, Random Forest) and transformer-based architectures (CodeBERT). Results show that
transformer models achieve strong performance (AUC up to 0.97) on syntactic clones (Types I and II) but
exhibit systematic degradation on semantic clones (Type IV, F1-score 0.72). Statistical tests confirm that
observed differences are significant ($p < 0.01$). Our findings underscore that uncontrolled redundancy
compromises experimental validity, and we advocate for routine duplication indexing and leakage
quantification as essential preprocessing steps for reliable benchmarking of code intelligence systems.},
}