% Journal article proposing DWSemClust, an LDA-based method for clustering
% deep web databases. Authors are separated with " and " so each name parses
% individually; the empty doi/url fields were dropped because no values are known.
@article{1709,
  author   = {Daud, Ali and Noor, Umara and Manzoor, Ayesha},
  title    = {Latent {Dirichlet} Allocation based Semantic Clustering of Heterogeneous Deep Web Sources},
  journal  = {Progress in Computing Applications},
  year     = {2014},
  volume   = {3},
  number   = {2},
  abstract = {Over the years a critical increase in the mass of the web has been observed. Among that a large part comprises of online subject-specific databases, hidden behind query interface forms known as deep web. Existing search engines are unable to completely index this highly relevant information due to its large volume. To access deep web content, the research community has proposed to organize it using machine learning techniques. Clustering is one of the key solutions to organize the deep web databases. Existing clustering methods do not encounter semantic relevance among deep web forms. In this paper, we propose a novel method DWSemClust to cluster deep web databases based on the semantic relevance found among deep web forms by employing a generative probabilistic model Latent Dirichlet Allocation (LDA) for modeling content representative of deep web databases. A document comprises of multiple topics, the task of LDA is to cluster words present in the document into ``topics''. The purpose of the parameter estimation process in the underlying model is to discover the document's topic and tell about its proportionate distribution in documents. Deep web has a sparse topic distribution. Due to this reason we have proposed to use LDA that is supposed to be a good clustering solution for the sparse distribution of topics. Further we employ a rich set of metadata as our content representative that comprises of form contents (single attribute/ multiple attributes) and page contents. Experimental results show that our proposed method clearly outperforms the existing non-semantics based clustering methods.},
}