% Asubiaro (2015), journal article on word-distribution statistics of diacritized vs. undiacritized Yoruba texts.
% No DOI is recorded here; the url field is the only locator for the full text.
@article{1830,
  author   = {Asubiaro, Toluwase},
  title    = {Statistical Patterns of Diacritized and Undiacritized {Yorùbá} Texts},
  journal  = {International Journal of Computational Linguistics Research},
  year     = {2015},
  volume   = {6},
  number   = {3},
  url      = {http://www.dline.info/jcl/fulltext/v6n3/v6n3_2.pdf},
  abstract = {Yorùbá standard orthography involves heavy use of diacritics for tone marking and representation of characters that are beyond ANSI scope. The diacritics are not always applied in many Yorùbá documents because specialized and language-dependent input devices for the language are very rarely available. Hence, this study aims at explicating the statistical implication of the inconsistency in the use of diacritics in electronic Yoruba documents on the distribution of word in the two versions of its texts. This was achieved by modeling the texts of Yoruba language based on Zipf’s and Heap’s law on the n-grams (for n=1, 2 and 3) with corporal of 1,089,318 words that are diacritically marked and its version that are unmarked diacritically. It was observed that the Zipf’s graphs of the two corporal exhibited no significant difference. On the other hand, the Heap’s graphs of the diacritized and undiacritized texts deviated significantly from the base. This shows that the use of the diacritics significantly affect single word distribution of the language but the effect reduced in the distribution of co-occurrences of two or more words.},
}