@comment{Entry 522: journal article on TelStem, an unsupervised Telugu stemmer. Author names are kept in First Last order because the surname boundaries are not confirmed (TODO: verify and switch to the Last, First form). No DOI is recorded for this article; add a doi field with the bare identifier if one exists.}
@article{522,
  author   = {A. P. Siva Kumar and P. Premchand and A. Govardhan},
  title    = {{TelStem}: An Unsupervised {Telugu} Stemmer with Heuristic Improvements and Normalized Signatures},
  journal  = {International Journal of Computational Linguistics Research},
  year     = {2011},
  volume   = {2},
  number   = {1},
  url      = {http://www.dline.info/jcl/fulltext/v2n1/2.pdf},
  abstract = {Stemming is a technique for reducing variant forms of a word to their roots (or stems) by enabling extraction of common suffixes. Stem need not correspond to the linguistic root of a word. Stemming is predominantly used in IR system to enrich retrieval effectiveness and to reduce the size of index for information retrieval task. This paper presents a systematic way of algorithm and implementation to develop an unsupervised Telugu stemmer using Take-All-Splits heuristic and improved paradigms. The performance of this stemmer is evaluated by taking two sets of 500 randomly extracted words. The trained Telugu corpus is taken from large set of documents containing 129066 words from CIIL Mysore. The accuracy is found to be 85.40\% after applying normalization heuristic. The F-score is 92.94\%, the recall is 91.97\% and the precision is 96.91\%. As the algorithm does not require any language specific information, it can be applied to other languages that are morphologically rich. The percentage reduction in index size for Telugu information retrieval task is also evaluated before and after normalization heuristic},
}