% Journal article record. Authors are stored as "Last, First" joined by " and "
% so BibTeX parses three separate names; {Bengali} is braced to survive
% sentence-casing styles. No DOI is recorded for this entry.
@article{1871,
  author   = {Kundu, Chandan and Das, Rajib Kumar and Sengupta, Kalyan},
  title    = {A High Performance Semi-Supervised Learning technique for Non-Standard Word Classification in {Bengali} News Corpus},
  journal  = {Journal of Intelligent Computing},
  year     = {2015},
  volume   = {6},
  number   = {3},
  url      = {http://www.dline.info/jic/fulltext/v6n3/v6n3_1.pdf},
  abstract = {The key disadvantage of the supervised learning technique is that it requires many hand-labeled test data to learn the classifier accurately. However, in this dynamic world, neither it is possible always to create database of labeled data, nor it is readily available in hand. Therefore, most of the users of a practical system would prefer algorithms that take few numbers of labeled data. This research paper demonstrates that semi-supervised naïve Bayes classifier using Expectation Maximization algorithm with few labeled data and huge number of inexpensive unlabeled data can create a high-accuracy non-standard word (NSW) classifier. It has been found that low information features contribute little to the accuracy of the naïve Bayes classifier. Therefore, we have eliminated these low information features during the estimation process and applied in the semi-supervised technique, thus provides a high performance model. The performance of the naïve Bayes classifier is good enough when there is huge number of labeled data. However, the EM method dramatically improves the accuracy of a NSW classifier, especially when there are only a few labeled data. We have carried out experiment on Bengali and English news corpus, but this is a general approach that can be applied to any language.},
}